Coverage Report

Created: 2025-07-23 06:28

/src/libhevc/encoder/hme_subpel.c
Line
Count
Source (jump to first uncovered line)
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2018 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
21
/**
22
******************************************************************************
23
* @file hme_subpel.c
24
*
25
* @brief
26
*    Subpel refinement modules for ME algo
27
*
28
* @author
29
*    Ittiam
30
*
31
*
32
* List of Functions
33
* hme_qpel_interp_avg()
34
* hme_subpel_refine_ctblist_bck()
35
* hme_subpel_refine_ctblist_fwd()
36
* hme_refine_bidirect()
37
* hme_subpel_refinement()
38
* hme_subpel_refine_ctb_fwd()
39
* hme_subpel_refine_ctb_bck()
40
* hme_create_bck_inp()
41
* hme_subpel_refine_search_node()
42
******************************************************************************
43
*/
44
45
/*****************************************************************************/
46
/* File Includes                                                             */
47
/*****************************************************************************/
48
/* System include files */
49
#include <stdio.h>
50
#include <string.h>
51
#include <stdlib.h>
52
#include <assert.h>
53
#include <stdarg.h>
54
#include <math.h>
55
#include <limits.h>
56
57
/* User include files */
58
#include "ihevc_typedefs.h"
59
#include "itt_video_api.h"
60
#include "ihevce_api.h"
61
62
#include "rc_cntrl_param.h"
63
#include "rc_frame_info_collector.h"
64
#include "rc_look_ahead_params.h"
65
66
#include "ihevc_defs.h"
67
#include "ihevc_structs.h"
68
#include "ihevc_platform_macros.h"
69
#include "ihevc_deblk.h"
70
#include "ihevc_itrans_recon.h"
71
#include "ihevc_chroma_itrans_recon.h"
72
#include "ihevc_chroma_intra_pred.h"
73
#include "ihevc_intra_pred.h"
74
#include "ihevc_inter_pred.h"
75
#include "ihevc_mem_fns.h"
76
#include "ihevc_padding.h"
77
#include "ihevc_weighted_pred.h"
78
#include "ihevc_sao.h"
79
#include "ihevc_resi_trans.h"
80
#include "ihevc_quant_iquant_ssd.h"
81
#include "ihevc_cabac_tables.h"
82
83
#include "ihevce_defs.h"
84
#include "ihevce_lap_enc_structs.h"
85
#include "ihevce_multi_thrd_structs.h"
86
#include "ihevce_multi_thrd_funcs.h"
87
#include "ihevce_me_common_defs.h"
88
#include "ihevce_had_satd.h"
89
#include "ihevce_error_codes.h"
90
#include "ihevce_bitstream.h"
91
#include "ihevce_cabac.h"
92
#include "ihevce_rdoq_macros.h"
93
#include "ihevce_function_selector.h"
94
#include "ihevce_enc_structs.h"
95
#include "ihevce_entropy_structs.h"
96
#include "ihevce_cmn_utils_instr_set_router.h"
97
#include "ihevce_enc_loop_structs.h"
98
#include "ihevce_bs_compute_ctb.h"
99
#include "ihevce_global_tables.h"
100
#include "ihevce_dep_mngr_interface.h"
101
#include "hme_datatype.h"
102
#include "hme_interface.h"
103
#include "hme_common_defs.h"
104
#include "hme_defs.h"
105
#include "ihevce_me_instr_set_router.h"
106
#include "hme_globals.h"
107
#include "hme_utils.h"
108
#include "hme_coarse.h"
109
#include "hme_fullpel.h"
110
#include "hme_subpel.h"
111
#include "hme_refine.h"
112
#include "hme_err_compute.h"
113
#include "hme_common_utils.h"
114
#include "hme_search_algo.h"
115
#include "ihevce_stasino_helpers.h"
116
#include "ihevce_common_utils.h"
117
118
/*****************************************************************************/
119
/* Function Definitions                                                      */
120
/*****************************************************************************/
121
/**
********************************************************************************
*  @fn     hme_qpel_interp_avg
*
*  @brief  Produces the prediction block for one qpel motion vector, either by
*          pointing directly into an existing reference plane or by averaging
*          two reference planes into an interpolation output buffer
*
*  @param[in,out]  ps_prms : reference plane ptrs, strides and block size on
*                  input; pu1_final_out / i4_final_out_stride on output
*
*  @param[in]  i4_mv_x : x component of motion vector in QPEL units
*
*  @param[in]  i4_mv_y : y component of motion vector in QPEL units
*
*  @param[in]  i4_buf_id : index into ps_prms->apu1_interp_out[] selecting the
*              destination buffer when averaging is required
*
*  @return None
********************************************************************************
*/
void hme_qpel_interp_avg(interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, S32 i4_buf_id)
{
    const S32 i4_stride = ps_prms->i4_ref_stride;
    /* Integer-pel part of the displacement, expressed as a buffer offset */
    const S32 i4_fpel_offset = (i4_mv_x >> 2) + (i4_mv_y >> 2) * i4_stride;
    qpel_input_buf_cfg_t *ps_cfg;
    U08 *pu1_first, *pu1_second, *pu1_avg_out;

    /*************************************************************************/
    /* For a given QPEL pt, two source pts are needed for QPEL averaging.    */
    /* i4_mv_x and i4_mv_y are the motion vectors in QPEL units pointing to  */
    /* the pt of interest, w.r.t. the 0,0 pt of the reference blk that is    */
    /* colocated to the inp blk.                                             */
    /*    A j E k B                                                          */
    /*    l m n o p                                                          */
    /*    F q G r H                                                          */
    /*    s t u v w                                                          */
    /*    C x I y D                                                          */
    /* In the diagram, A, B, C, D are full pts at offsets (0,0),(1,0),(0,1)  */
    /* and (1,1) respectively in the fpel buffer (id = 0).                   */
    /* E and I are hxfy pts at offsets (0,0),(0,1) respectively in hxfy buf. */
    /* F and H are fxhy pts at offsets (0,0),(1,0) respectively in fxhy buf. */
    /* G is the hxhy pt at offset (0,0) in hxhy buf.                         */
    /* All offsets are relative to the motion displaced pt in the respective */
    /* buf, i.e. A corresponds to (i4_mv_x >> 2), (i4_mv_y >> 2) in fxfy buf.*/
    /* fxfy buf is buf id 0, hxfy is buf id 1, fxhy is buf id 2, hxhy is 3.  */
    /* Example: pt v has a fractional comp of (3, 3) and is the avg of H and */
    /* I, so the table look up for v should give                             */
    /*   src 1 (H) : offset = (1, 0), buf id = 2                             */
    /*   src 2 (I) : offset = (0, 1), buf id = 1                             */
    /* NOTE(review): an earlier comment stated that for fxfy/hxfy/fxhy/hxhy  */
    /* pts one buf id is -1; the code below instead tests id1 == id2.        */
    /* Confirm against the contents of gas_qpel_inp_buf_cfg.                 */
    /*************************************************************************/

    /* Descriptor is indexed by the fractional (qpel) part of the mv: [y][x] */
    ps_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y & 3][i4_mv_x & 3];

    /* The first source is needed on both paths */
    pu1_first = ps_prms->ppu1_ref[ps_cfg->i1_buf_id1];
    pu1_first += ps_cfg->i1_buf_xoff1 + i4_fpel_offset;
    pu1_first += (ps_cfg->i1_buf_yoff1 * i4_stride);

    if(ps_cfg->i1_buf_id1 != ps_cfg->i1_buf_id2)
    {
        /* True qpel pt: average the two sources into the chosen out buffer */
        pu1_second = ps_prms->ppu1_ref[ps_cfg->i1_buf_id2];
        pu1_second += ps_cfg->i1_buf_xoff2 + i4_fpel_offset;
        pu1_second += (ps_cfg->i1_buf_yoff2 * i4_stride);

        pu1_avg_out = ps_prms->apu1_interp_out[i4_buf_id];

        hevc_avg_2d(
            pu1_first,
            pu1_second,
            i4_stride,
            i4_stride,
            ps_prms->i4_blk_wd,
            ps_prms->i4_blk_ht,
            pu1_avg_out,
            ps_prms->i4_out_stride);

        ps_prms->pu1_final_out = pu1_avg_out;
        ps_prms->i4_final_out_stride = ps_prms->i4_out_stride;
    }
    else
    {
        /* Both sources coincide (fxfy/hxfy/fxhy/hxhy case): no averaging,  */
        /* the result is read straight from the reference plane             */
        ps_prms->pu1_final_out = pu1_first;
        ps_prms->i4_final_out_stride = i4_stride;
    }
}
193
194
static __inline void hme_qpel_interp_avg_2pt_vert_no_reuse(
195
    interp_prms_t *ps_prms,
196
    S32 i4_mv_x,
197
    S32 i4_mv_y,
198
    U08 **ppu1_final,
199
    S32 *pi4_final_stride,
200
    FT_QPEL_INTERP_AVG_1PT *pf_qpel_interp_avg_1pt)
201
33.5k
{
202
33.5k
    pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x, i4_mv_y + 1, 3, ppu1_final, pi4_final_stride);
203
204
33.5k
    pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x, i4_mv_y - 1, 1, ppu1_final, pi4_final_stride);
205
33.5k
}
206
207
static __inline void hme_qpel_interp_avg_2pt_horz_no_reuse(
208
    interp_prms_t *ps_prms,
209
    S32 i4_mv_x,
210
    S32 i4_mv_y,
211
    U08 **ppu1_final,
212
    S32 *pi4_final_stride,
213
    FT_QPEL_INTERP_AVG_1PT *pf_qpel_interp_avg_1pt)
214
27.5k
{
215
27.5k
    pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x + 1, i4_mv_y, 2, ppu1_final, pi4_final_stride);
216
217
27.5k
    pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x - 1, i4_mv_y, 0, ppu1_final, pi4_final_stride);
218
27.5k
}
219
220
/********************************************************************************
221
*  @fn     hme_qpel_interp_comprehensive
222
*
223
*  @brief  Interpolates 2 qpel points by hpel averaging
224
*
225
*  @param[in,out]  ps_prms: Both input buffer ptrs and location of output
226
*
227
*  @param[in]  i4_mv_x : x component of motion vector in QPEL units
228
*
229
*  @param[in]  i4_mv_y : y component of motion vector in QPEL units
230
*
231
*  @param[in]  i4_grid_mask : mask which determines qpels to be computed
232
*
233
*  @param[out]  ppu1_final : storage for final buffer pointers
234
*
235
*  @param[out]  pi4_final_stride : storage for final buffer strides
236
*
237
*  @return None
238
********************************************************************************
239
*/
240
static __inline void hme_qpel_interp_comprehensive(
241
    interp_prms_t *ps_prms,
242
    U08 **ppu1_final,
243
    S32 *pi4_final_stride,
244
    S32 i4_mv_x,
245
    S32 i4_mv_y,
246
    S32 i4_grid_mask,
247
    ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
248
3.77M
{
249
3.77M
    S32 pt_select_for_TB, pt_select_for_LR;
250
3.77M
    S32 dx, dy, dydx;
251
3.77M
    S32 vert_func_selector, horz_func_selector;
252
253
3.77M
    S32 i4_ref_stride = ps_prms->i4_ref_stride;
254
255
3.77M
    pt_select_for_TB =
256
3.77M
        ((i4_grid_mask & (1 << PT_B)) >> PT_B) + ((i4_grid_mask & (1 << PT_T)) >> (PT_T - 1));
257
258
3.77M
    pt_select_for_LR =
259
3.77M
        ((i4_grid_mask & (1 << PT_R)) >> PT_R) + ((i4_grid_mask & (1 << PT_L)) >> (PT_L - 1));
260
261
3.77M
    dx = (i4_mv_x & 3);
262
3.77M
    dy = (i4_mv_y & 3);
263
3.77M
    dydx = (dx + (dy << 2));
264
265
3.77M
    vert_func_selector = gai4_select_qpel_function_vert[pt_select_for_TB][dydx];
266
3.77M
    horz_func_selector = gai4_select_qpel_function_horz[pt_select_for_LR][dydx];
267
268
    /* case descriptions */
269
    /* Let T = (gridmask & T) & B = (gridmask & B) */
270
    /* & hp = pt is an hpel or an fpel */
271
    /* & r = reuse possible */
272
    /* 0 => T || B = 0 */
273
    /* 1 => (!T) && (B) && hp */
274
    /* 2 => (T) && (!B) && hp */
275
    /* 3 => (!T) && (B) && !hp */
276
    /* 4 => (T) && (!B) && !hp */
277
    /* 5 => (T) && (B) && !hp && r */
278
    /* 6 => (T) && (B) && !hp && !r */
279
    /* 7 => (T) && (B) && hp */
280
281
3.77M
    switch(vert_func_selector)
282
3.77M
    {
283
0
    case 0:
284
0
    {
285
0
        break;
286
0
    }
287
55.9k
    case 1:
288
55.9k
    {
289
55.9k
        S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
290
55.9k
        qpel_input_buf_cfg_t *ps_inp_cfg;
291
55.9k
        S32 i4_mvyp1 = (i4_mv_y + 1);
292
293
55.9k
        i4_mv_x_frac = dx;
294
55.9k
        i4_mv_y_frac = i4_mvyp1 & 3;
295
296
55.9k
        i4_offset = (i4_mv_x >> 2) + (i4_mvyp1 >> 2) * i4_ref_stride;
297
298
        /* Derive the descriptor that has all offset and size info */
299
55.9k
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
300
301
55.9k
        ppu1_final[3] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
302
55.9k
        ppu1_final[3] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
303
55.9k
        ppu1_final[3] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
304
55.9k
        pi4_final_stride[3] = i4_ref_stride;
305
306
55.9k
        break;
307
0
    }
308
53.3k
    case 2:
309
53.3k
    {
310
53.3k
        S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
311
53.3k
        qpel_input_buf_cfg_t *ps_inp_cfg;
312
53.3k
        S32 i4_mvym1 = (i4_mv_y - 1);
313
314
53.3k
        i4_mv_x_frac = dx;
315
53.3k
        i4_mv_y_frac = i4_mvym1 & 3;
316
317
53.3k
        i4_offset = (i4_mv_x >> 2) + (i4_mvym1 >> 2) * i4_ref_stride;
318
319
        /* Derive the descriptor that has all offset and size info */
320
53.3k
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
321
322
53.3k
        ppu1_final[1] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
323
53.3k
        ppu1_final[1] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
324
53.3k
        ppu1_final[1] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
325
53.3k
        pi4_final_stride[1] = i4_ref_stride;
326
327
53.3k
        break;
328
0
    }
329
16.4k
    case 3:
330
16.4k
    {
331
16.4k
        ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
332
16.4k
            ps_prms, i4_mv_x, i4_mv_y + 1, 3, ppu1_final, pi4_final_stride);
333
334
16.4k
        break;
335
0
    }
336
14.9k
    case 4:
337
14.9k
    {
338
14.9k
        ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
339
14.9k
            ps_prms, i4_mv_x, i4_mv_y - 1, 1, ppu1_final, pi4_final_stride);
340
341
14.9k
        break;
342
0
    }
343
3.60M
    case 5:
344
3.60M
    {
345
3.60M
        ps_me_optimised_function_list->pf_qpel_interp_avg_2pt_vert_with_reuse(
346
3.60M
            ps_prms, i4_mv_x, i4_mv_y, ppu1_final, pi4_final_stride);
347
3.60M
        break;
348
0
    }
349
33.5k
    case 6:
350
33.5k
    {
351
33.5k
        hme_qpel_interp_avg_2pt_vert_no_reuse(
352
33.5k
            ps_prms,
353
33.5k
            i4_mv_x,
354
33.5k
            i4_mv_y,
355
33.5k
            ppu1_final,
356
33.5k
            pi4_final_stride,
357
33.5k
            ps_me_optimised_function_list->pf_qpel_interp_avg_1pt);
358
33.5k
        break;
359
0
    }
360
0
    case 7:
361
0
    {
362
0
        S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
363
0
        qpel_input_buf_cfg_t *ps_inp_cfg;
364
365
0
        S32 i4_mvyp1 = (i4_mv_y + 1);
366
0
        S32 i4_mvym1 = (i4_mv_y - 1);
367
368
0
        i4_mv_x_frac = dx;
369
0
        i4_mv_y_frac = i4_mvyp1 & 3;
370
371
0
        i4_offset = (i4_mv_x >> 2) + (i4_mvyp1 >> 2) * i4_ref_stride;
372
373
        /* Derive the descriptor that has all offset and size info */
374
0
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
375
376
0
        ppu1_final[3] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
377
0
        ppu1_final[3] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
378
0
        ppu1_final[3] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
379
0
        pi4_final_stride[3] = i4_ref_stride;
380
381
0
        i4_mv_y_frac = i4_mvym1 & 3;
382
383
0
        i4_offset = (i4_mv_x >> 2) + (i4_mvym1 >> 2) * i4_ref_stride;
384
385
        /* Derive the descriptor that has all offset and size info */
386
0
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
387
388
0
        ppu1_final[1] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
389
0
        ppu1_final[1] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
390
0
        ppu1_final[1] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
391
0
        pi4_final_stride[1] = i4_ref_stride;
392
393
0
        break;
394
0
    }
395
3.77M
    }
396
397
    /* case descriptions */
398
    /* Let L = (gridmask & L) & R = (gridmask & R) */
399
    /* & hp = pt is an hpel or an fpel */
400
    /* & r = reuse possible */
401
    /* 0 => L || R = 0 */
402
    /* 1 => (!L) && (R) && hp */
403
    /* 2 => (L) && (!R) && hp */
404
    /* 3 => (!L) && (R) && !hp */
405
    /* 4 => (L) && (!R) && !hp */
406
    /* 5 => (L) && (R) && !hp && r */
407
    /* 6 => (L) && (R) && !hp && !r */
408
    /* 7 => (L) && (R) && hp */
409
410
3.77M
    switch(horz_func_selector)
411
3.77M
    {
412
0
    case 0:
413
0
    {
414
0
        break;
415
0
    }
416
52.1k
    case 1:
417
52.1k
    {
418
52.1k
        S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
419
52.1k
        qpel_input_buf_cfg_t *ps_inp_cfg;
420
52.1k
        S32 i4_mvxp1 = (i4_mv_x + 1);
421
422
52.1k
        i4_mv_x_frac = i4_mvxp1 & 3;
423
52.1k
        i4_mv_y_frac = dy;
424
425
52.1k
        i4_offset = (i4_mvxp1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
426
427
        /* Derive the descriptor that has all offset and size info */
428
52.1k
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
429
430
52.1k
        ppu1_final[2] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
431
52.1k
        ppu1_final[2] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
432
52.1k
        ppu1_final[2] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
433
52.1k
        pi4_final_stride[2] = i4_ref_stride;
434
435
52.1k
        break;
436
0
    }
437
49.0k
    case 2:
438
49.0k
    {
439
49.0k
        S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
440
49.0k
        qpel_input_buf_cfg_t *ps_inp_cfg;
441
49.0k
        S32 i4_mvxm1 = (i4_mv_x - 1);
442
443
49.0k
        i4_mv_x_frac = i4_mvxm1 & 3;
444
49.0k
        i4_mv_y_frac = dy;
445
446
49.0k
        i4_offset = (i4_mvxm1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
447
448
        /* Derive the descriptor that has all offset and size info */
449
49.0k
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
450
451
49.0k
        ppu1_final[0] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
452
49.0k
        ppu1_final[0] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
453
49.0k
        ppu1_final[0] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
454
49.0k
        pi4_final_stride[0] = i4_ref_stride;
455
456
49.0k
        break;
457
0
    }
458
20.3k
    case 3:
459
20.3k
    {
460
20.3k
        ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
461
20.3k
            ps_prms, i4_mv_x + 1, i4_mv_y, 2, ppu1_final, pi4_final_stride);
462
463
20.3k
        break;
464
0
    }
465
18.9k
    case 4:
466
18.9k
    {
467
18.9k
        ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
468
18.9k
            ps_prms, i4_mv_x - 1, i4_mv_y, 0, ppu1_final, pi4_final_stride);
469
470
18.9k
        break;
471
0
    }
472
3.60M
    case 5:
473
3.60M
    {
474
3.60M
        ps_me_optimised_function_list->pf_qpel_interp_avg_2pt_horz_with_reuse(
475
3.60M
            ps_prms, i4_mv_x, i4_mv_y, ppu1_final, pi4_final_stride);
476
3.60M
        break;
477
0
    }
478
27.5k
    case 6:
479
27.5k
    {
480
27.5k
        hme_qpel_interp_avg_2pt_horz_no_reuse(
481
27.5k
            ps_prms,
482
27.5k
            i4_mv_x,
483
27.5k
            i4_mv_y,
484
27.5k
            ppu1_final,
485
27.5k
            pi4_final_stride,
486
27.5k
            ps_me_optimised_function_list->pf_qpel_interp_avg_1pt);
487
27.5k
        break;
488
0
    }
489
0
    case 7:
490
0
    {
491
0
        S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
492
0
        qpel_input_buf_cfg_t *ps_inp_cfg;
493
494
0
        S32 i4_mvxp1 = (i4_mv_x + 1);
495
0
        S32 i4_mvxm1 = (i4_mv_x - 1);
496
497
0
        i4_mv_x_frac = i4_mvxp1 & 3;
498
0
        i4_mv_y_frac = dy;
499
500
0
        i4_offset = (i4_mvxp1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
501
502
        /* Derive the descriptor that has all offset and size info */
503
0
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
504
505
0
        ppu1_final[2] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
506
0
        ppu1_final[2] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
507
0
        ppu1_final[2] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
508
0
        pi4_final_stride[2] = i4_ref_stride;
509
510
0
        i4_mv_x_frac = i4_mvxm1 & 3;
511
512
0
        i4_offset = (i4_mvxm1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
513
514
        /* Derive the descriptor that has all offset and size info */
515
0
        ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
516
517
0
        ppu1_final[0] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
518
0
        ppu1_final[0] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
519
0
        ppu1_final[0] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
520
0
        pi4_final_stride[0] = i4_ref_stride;
521
522
0
        break;
523
0
    }
524
3.77M
    }
525
3.77M
}
526
527
/**
528
********************************************************************************
529
*  @fn     S32 hme_compute_pred_and_evaluate_bi(hme_subpel_prms_t *ps_prms,
530
*                                   search_results_t *ps_search_results,
531
*                                   layer_ctxt_t *ps_curr_layer,
532
*                                   U08 **ppu1_pred)
533
*
534
*
535
*  @brief  Evaluates the best bipred cost as avg(P0, P1) where P0 and P1 are
536
*          best L0 and L1 bufs respectively for the entire CU
537
*
538
*  @param[in]  ps_prms: subpel prms input to this function
539
*
540
*  @param[in] ps_curr_layer: points to the current layer ctxt
541
*
542
*  @return None (the function is declared void). It evaluates the best BI cost
*          or the best uni cost, whichever is better
543
********************************************************************************
544
*/
545
void hme_compute_pred_and_evaluate_bi(
546
    inter_cu_results_t *ps_cu_results,
547
    inter_pu_results_t *ps_pu_results,
548
    inter_ctb_prms_t *ps_inter_ctb_prms,
549
    part_type_results_t *ps_part_type_result,
550
    ULWORD64 *pu8_winning_pred_sigmaXSquare,
551
    ULWORD64 *pu8_winning_pred_sigmaX,
552
    ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list,
553
    ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
554
7.29M
{
555
    /* Idx0 - Uni winner */
556
    /* Idx1 - Uni runner-up */
557
    /* Idx2 - Bi winner */
558
7.29M
    hme_pred_buf_info_t as_pred_buf_data[3][NUM_INTER_PU_PARTS];
559
7.29M
    err_prms_t s_err_prms;
560
7.29M
    interp_prms_t s_interp_prms;
561
562
7.29M
    PF_SAD_FXN_T pf_err_compute;
563
564
7.29M
    S32 i, j;
565
7.29M
    S32 x_off, y_off, x_pic, y_pic;
566
7.29M
    S32 i4_sad_grid;
567
7.29M
    U08 e_cu_size;
568
7.29M
    S32 i4_part_type;
569
7.29M
    U08 u1_cu_size;
570
7.29M
    S32 shift;
571
7.29M
    S32 x_part, y_part, num_parts;
572
7.29M
    S32 inp_stride, ref_stride;
573
7.29M
    U08 au1_pred_buf_array_indixes[3];
574
7.29M
    S32 cur_iter_best_cost;
575
7.29M
    S32 uni_cost, bi_cost, best_cost, tot_cost;
576
    /* Idx0 - Uni winner */
577
    /* Idx1 - Bi winner */
578
7.29M
    ULWORD64 au8_sigmaX[2][NUM_INTER_PU_PARTS];
579
7.29M
    ULWORD64 au8_sigmaXSquared[2][NUM_INTER_PU_PARTS];
580
7.29M
#if USE_NOISE_TERM_DURING_BICAND_SEARCH
581
7.29M
    S32 i4_noise_term;
582
7.29M
#endif
583
584
7.29M
    interp_prms_t *ps_interp_prms = &s_interp_prms;
585
586
7.29M
    S32 best_cand_in_opp_dir_idx = 0;
587
7.29M
    S32 is_best_cand_an_intra = 0;
588
7.29M
    U08 u1_is_cu_noisy = ps_inter_ctb_prms->u1_is_cu_noisy;
589
7.29M
#if USE_NOISE_TERM_DURING_BICAND_SEARCH
590
7.29M
    const S32 i4_default_src_wt = ((1 << 15) + (WGHT_DEFAULT >> 1)) / WGHT_DEFAULT;
591
7.29M
#endif
592
7.29M
    tot_cost = 0;
593
594
    /* Start of the CU w.r.t. CTB */
595
7.29M
    x_off = ps_cu_results->u1_x_off;
596
7.29M
    y_off = ps_cu_results->u1_y_off;
597
598
7.29M
    inp_stride = ps_inter_ctb_prms->i4_inp_stride;
599
7.29M
    ref_stride = ps_inter_ctb_prms->i4_rec_stride;
600
601
7.29M
    ps_interp_prms->i4_ref_stride = ref_stride;
602
603
    /* Start of the CU w.r.t. Pic 0,0 */
604
7.29M
    x_pic = x_off + ps_inter_ctb_prms->i4_ctb_x_off;
605
7.29M
    y_pic = y_off + ps_inter_ctb_prms->i4_ctb_y_off;
606
607
7.29M
    u1_cu_size = ps_cu_results->u1_cu_size;
608
7.29M
    e_cu_size = u1_cu_size;
609
7.29M
    shift = (S32)e_cu_size;
610
7.29M
    i4_part_type = ps_part_type_result->u1_part_type;
611
7.29M
    num_parts = gau1_num_parts_in_part_type[i4_part_type];
612
613
29.1M
    for(i = 0; i < 3; i++)
614
21.8M
    {
615
21.8M
        hme_init_pred_buf_info(
616
21.8M
            &as_pred_buf_data[i],
617
21.8M
            &ps_inter_ctb_prms->s_pred_buf_mngr,
618
21.8M
            (ps_part_type_result->as_pu_results->pu.b4_wd + 1) << 2,
619
21.8M
            (ps_part_type_result->as_pu_results->pu.b4_ht + 1) << 2,
620
21.8M
            (PART_TYPE_T)i4_part_type);
621
622
21.8M
        au1_pred_buf_array_indixes[i] = as_pred_buf_data[i][0].u1_pred_buf_array_id;
623
21.8M
    }
624
625
16.3M
    for(j = 0; j < num_parts; j++)
626
9.08M
    {
627
9.08M
        UWORD8 *apu1_hpel_ref[2][4];
628
9.08M
        PART_ID_T e_part_id;
629
9.08M
        BLK_SIZE_T e_blk_size;
630
9.08M
        WORD8 i1_ref_idx;
631
9.08M
        UWORD8 pred_dir;
632
9.08M
        WORD32 ref_offset, inp_offset, wd, ht;
633
9.08M
        pu_result_t *ps_pu_node1, *ps_pu_node2, *ps_pu_result;
634
9.08M
        mv_t *aps_mv[2];
635
9.08M
        UWORD8 num_active_ref_opp;
636
9.08M
        UWORD8 num_results_per_part;
637
9.08M
        WORD32 luma_weight_ref1, luma_offset_ref1;
638
9.08M
        WORD32 luma_weight_ref2, luma_offset_ref2;
639
9.08M
        WORD32 pu_node2_found = 0;
640
641
9.08M
        e_part_id = ge_part_type_to_part_id[i4_part_type][j];
642
9.08M
        e_blk_size = ge_part_id_to_blk_size[e_cu_size][e_part_id];
643
644
9.08M
        x_part = gas_part_attr_in_cu[e_part_id].u1_x_start << shift;
645
9.08M
        y_part = gas_part_attr_in_cu[e_part_id].u1_y_start << shift;
646
647
9.08M
        ref_offset = (x_part + x_pic) + (y_pic + y_part) * ref_stride;
648
9.08M
        inp_offset = (x_part + y_part * inp_stride) + ps_cu_results->i4_inp_offset;
649
650
9.08M
        pred_dir = ps_part_type_result->as_pu_results[j].pu.b2_pred_mode;
651
652
9.08M
        ps_pu_node1 = &(ps_part_type_result->as_pu_results[j]);
653
654
9.08M
        if(PRED_L0 == pred_dir)
655
8.27M
        {
656
8.27M
            i1_ref_idx = ps_pu_node1->pu.mv.i1_l0_ref_idx;
657
8.27M
            aps_mv[0] = &(ps_pu_node1->pu.mv.s_l0_mv);
658
659
8.27M
            num_active_ref_opp =
660
8.27M
                ps_inter_ctb_prms->u1_num_active_ref_l1 * (ps_inter_ctb_prms->i4_bidir_enabled);
661
8.27M
            num_results_per_part = ps_pu_results->u1_num_results_per_part_l0[e_part_id];
662
663
8.27M
            ps_pu_result = ps_pu_results->aps_pu_results[PRED_L0][e_part_id];
664
665
8.27M
            ASSERT(i1_ref_idx >= 0);
666
667
8.27M
            apu1_hpel_ref[0][0] =
668
8.27M
                (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->s_yuv_buf_desc.pv_y_buf) +
669
8.27M
                ref_offset;
670
8.27M
            apu1_hpel_ref[0][1] =
671
8.27M
                ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
672
8.27M
                ref_offset;
673
8.27M
            apu1_hpel_ref[0][2] =
674
8.27M
                ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
675
8.27M
                ref_offset;
676
8.27M
            apu1_hpel_ref[0][3] =
677
8.27M
                ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
678
8.27M
                ref_offset;
679
680
8.27M
            luma_weight_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
681
8.27M
                                   ->s_weight_offset.i2_luma_weight;
682
8.27M
            luma_offset_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
683
8.27M
                                   ->s_weight_offset.i2_luma_offset;
684
8.27M
        }
685
810k
        else
686
810k
        {
687
810k
            i1_ref_idx = ps_pu_node1->pu.mv.i1_l1_ref_idx;
688
810k
            aps_mv[0] = &(ps_pu_node1->pu.mv.s_l1_mv);
689
690
810k
            ASSERT(i1_ref_idx >= 0);
691
692
810k
            num_active_ref_opp =
693
810k
                ps_inter_ctb_prms->u1_num_active_ref_l0 * (ps_inter_ctb_prms->i4_bidir_enabled);
694
810k
            num_results_per_part = ps_pu_results->u1_num_results_per_part_l1[e_part_id];
695
696
810k
            ps_pu_result = ps_pu_results->aps_pu_results[PRED_L1][e_part_id];
697
698
810k
            apu1_hpel_ref[0][0] =
699
810k
                (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->s_yuv_buf_desc.pv_y_buf) +
700
810k
                ref_offset;
701
810k
            apu1_hpel_ref[0][1] =
702
810k
                ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
703
810k
                ref_offset;
704
810k
            apu1_hpel_ref[0][2] =
705
810k
                ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
706
810k
                ref_offset;
707
810k
            apu1_hpel_ref[0][3] =
708
810k
                ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
709
810k
                ref_offset;
710
711
810k
            luma_weight_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
712
810k
                                   ->s_weight_offset.i2_luma_weight;
713
810k
            luma_offset_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
714
810k
                                   ->s_weight_offset.i2_luma_offset;
715
810k
        }
716
717
9.08M
        if(aps_mv[0]->i2_mvx == INTRA_MV)
718
0
        {
719
0
            uni_cost = ps_pu_node1->i4_tot_cost;
720
0
            cur_iter_best_cost = ps_pu_node1->i4_tot_cost;
721
0
            best_cost = MIN(uni_cost, cur_iter_best_cost);
722
0
            tot_cost += best_cost;
723
0
            continue;
724
0
        }
725
726
9.08M
        ps_interp_prms->i4_blk_wd = wd = gau1_blk_size_to_wd[e_blk_size];
727
9.08M
        ps_interp_prms->i4_blk_ht = ht = gau1_blk_size_to_ht[e_blk_size];
728
9.08M
        ps_interp_prms->i4_out_stride = MAX_CU_SIZE;
729
730
9.08M
        if(num_active_ref_opp)
731
2.70M
        {
732
2.70M
            if(PRED_L0 == pred_dir)
733
1.94M
            {
734
1.94M
                if(ps_pu_results->u1_num_results_per_part_l1[e_part_id])
735
1.82M
                {
736
1.82M
                    ps_pu_node2 = ps_pu_results->aps_pu_results[1][e_part_id];
737
1.82M
                    pu_node2_found = 1;
738
1.82M
                }
739
1.94M
            }
740
762k
            else
741
762k
            {
742
762k
                if(ps_pu_results->u1_num_results_per_part_l0[e_part_id])
743
762k
                {
744
762k
                    ps_pu_node2 = ps_pu_results->aps_pu_results[0][e_part_id];
745
762k
                    pu_node2_found = 1;
746
762k
                }
747
762k
            }
748
2.70M
        }
749
750
9.08M
        if(!pu_node2_found)
751
6.49M
        {
752
6.49M
            bi_cost = INT_MAX >> 1;
753
754
6.49M
            s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[0][j].pu1_pred;
755
6.49M
            ps_interp_prms->ppu1_ref = &apu1_hpel_ref[0][0];
756
757
6.49M
            ps_me_optimised_function_list->pf_qpel_interp_avg_generic(
758
6.49M
                ps_interp_prms, aps_mv[0]->i2_mvx, aps_mv[0]->i2_mvy, 0);
759
760
6.49M
            if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0])
761
6.17M
            {
762
6.17M
                as_pred_buf_data[0][j].u1_pred_buf_array_id = UCHAR_MAX;
763
6.17M
                as_pred_buf_data[0][j].pu1_pred = ps_interp_prms->pu1_final_out;
764
6.17M
                as_pred_buf_data[0][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride;
765
6.17M
            }
766
767
6.49M
            if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
768
0
            {
769
0
                hme_compute_sigmaX_and_sigmaXSquared(
770
0
                    as_pred_buf_data[0][j].pu1_pred,
771
0
                    as_pred_buf_data[0][j].i4_pred_stride,
772
0
                    &au8_sigmaX[0][j],
773
0
                    &au8_sigmaXSquared[0][j],
774
0
                    ps_interp_prms->i4_blk_wd,
775
0
                    ps_interp_prms->i4_blk_ht,
776
0
                    ps_interp_prms->i4_blk_wd,
777
0
                    ps_interp_prms->i4_blk_ht,
778
0
                    0,
779
0
                    1);
780
0
            }
781
6.49M
        }
782
2.58M
        else
783
2.58M
        {
784
2.58M
            i = 0;
785
2.58M
            bi_cost = MAX_32BIT_VAL;
786
2.58M
            is_best_cand_an_intra = 0;
787
2.58M
            best_cand_in_opp_dir_idx = 0;
788
789
2.58M
            pred_dir = ps_pu_node2[i].pu.b2_pred_mode;
790
791
2.58M
            if(PRED_L0 == pred_dir)
792
762k
            {
793
762k
                i1_ref_idx = ps_pu_node2[i].pu.mv.i1_l0_ref_idx;
794
762k
                aps_mv[1] = &(ps_pu_node2[i].pu.mv.s_l0_mv);
795
796
762k
                ASSERT(i1_ref_idx >= 0);
797
798
762k
                apu1_hpel_ref[1][0] =
799
762k
                    (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
800
762k
                                   ->s_yuv_buf_desc.pv_y_buf) +
801
762k
                    ref_offset;  //>ppu1_list_rec_fxfy[0][i1_ref_idx] + ref_offset;
802
762k
                apu1_hpel_ref[1][1] =
803
762k
                    ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
804
762k
                    ref_offset;
805
762k
                apu1_hpel_ref[1][2] =
806
762k
                    ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
807
762k
                    ref_offset;
808
762k
                apu1_hpel_ref[1][3] =
809
762k
                    ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
810
762k
                    ref_offset;
811
812
762k
                luma_weight_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
813
762k
                                       ->s_weight_offset.i2_luma_weight;
814
762k
                luma_offset_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
815
762k
                                       ->s_weight_offset.i2_luma_offset;
816
762k
            }
817
1.82M
            else
818
1.82M
            {
819
1.82M
                i1_ref_idx = ps_pu_node2[i].pu.mv.i1_l1_ref_idx;
820
1.82M
                aps_mv[1] = &(ps_pu_node2[i].pu.mv.s_l1_mv);
821
822
1.82M
                ASSERT(i1_ref_idx >= 0);
823
824
1.82M
                apu1_hpel_ref[1][0] =
825
1.82M
                    (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
826
1.82M
                                   ->s_yuv_buf_desc.pv_y_buf) +
827
1.82M
                    ref_offset;  //>ppu1_list_rec_fxfy[0][i1_ref_idx] + ref_offset;
828
1.82M
                apu1_hpel_ref[1][1] =
829
1.82M
                    ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
830
1.82M
                    ref_offset;
831
1.82M
                apu1_hpel_ref[1][2] =
832
1.82M
                    ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
833
1.82M
                    ref_offset;
834
1.82M
                apu1_hpel_ref[1][3] =
835
1.82M
                    ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
836
1.82M
                    ref_offset;
837
838
1.82M
                luma_weight_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
839
1.82M
                                       ->s_weight_offset.i2_luma_weight;
840
1.82M
                luma_offset_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
841
1.82M
                                       ->s_weight_offset.i2_luma_offset;
842
1.82M
            }
843
844
2.58M
            if(aps_mv[1]->i2_mvx == INTRA_MV)
845
0
            {
846
0
                uni_cost = ps_pu_node1->i4_tot_cost;
847
0
                cur_iter_best_cost = ps_pu_node2[i].i4_tot_cost;
848
849
0
                if(cur_iter_best_cost < bi_cost)
850
0
                {
851
0
                    bi_cost = cur_iter_best_cost;
852
0
                    best_cand_in_opp_dir_idx = i;
853
0
                    is_best_cand_an_intra = 1;
854
0
                }
855
856
0
                best_cost = MIN(uni_cost, bi_cost);
857
0
                tot_cost += best_cost;
858
0
                continue;
859
0
            }
860
861
2.58M
            s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[0][j].pu1_pred;
862
2.58M
            ps_interp_prms->ppu1_ref = &apu1_hpel_ref[0][0];
863
864
2.58M
            ps_me_optimised_function_list->pf_qpel_interp_avg_generic(
865
2.58M
                ps_interp_prms, aps_mv[0]->i2_mvx, aps_mv[0]->i2_mvy, 0);
866
867
2.58M
            if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0])
868
2.49M
            {
869
2.49M
                as_pred_buf_data[0][j].u1_pred_buf_array_id = UCHAR_MAX;
870
2.49M
                as_pred_buf_data[0][j].pu1_pred = ps_interp_prms->pu1_final_out;
871
2.49M
                as_pred_buf_data[0][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride;
872
2.49M
            }
873
874
2.58M
            if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
875
0
            {
876
0
                hme_compute_sigmaX_and_sigmaXSquared(
877
0
                    as_pred_buf_data[0][j].pu1_pred,
878
0
                    as_pred_buf_data[0][j].i4_pred_stride,
879
0
                    &au8_sigmaX[0][j],
880
0
                    &au8_sigmaXSquared[0][j],
881
0
                    ps_interp_prms->i4_blk_wd,
882
0
                    ps_interp_prms->i4_blk_ht,
883
0
                    ps_interp_prms->i4_blk_wd,
884
0
                    ps_interp_prms->i4_blk_ht,
885
0
                    0,
886
0
                    1);
887
0
            }
888
889
2.58M
            s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[1][j].pu1_pred;
890
2.58M
            ps_interp_prms->ppu1_ref = &apu1_hpel_ref[1][0];
891
892
2.58M
            ps_me_optimised_function_list->pf_qpel_interp_avg_generic(
893
2.58M
                ps_interp_prms, aps_mv[1]->i2_mvx, aps_mv[1]->i2_mvy, 0);
894
895
2.58M
            if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0])
896
2.49M
            {
897
2.49M
                as_pred_buf_data[1][j].u1_pred_buf_array_id = UCHAR_MAX;
898
2.49M
                as_pred_buf_data[1][j].pu1_pred = ps_interp_prms->pu1_final_out;
899
2.49M
                as_pred_buf_data[1][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride;
900
2.49M
            }
901
902
2.58M
            ps_cmn_utils_optimised_function_list->pf_wt_avg_2d(
903
2.58M
                as_pred_buf_data[0][j].pu1_pred,
904
2.58M
                as_pred_buf_data[1][j].pu1_pred,
905
2.58M
                as_pred_buf_data[0][j].i4_pred_stride,
906
2.58M
                as_pred_buf_data[1][j].i4_pred_stride,
907
2.58M
                wd,
908
2.58M
                ht,
909
2.58M
                as_pred_buf_data[2][j].pu1_pred,
910
2.58M
                as_pred_buf_data[2][j].i4_pred_stride,
911
2.58M
                luma_weight_ref1,
912
2.58M
                luma_weight_ref2,
913
2.58M
                luma_offset_ref1,
914
2.58M
                luma_offset_ref2,
915
2.58M
                ps_inter_ctb_prms->wpred_log_wdc);
916
917
2.58M
            if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
918
0
            {
919
0
                hme_compute_sigmaX_and_sigmaXSquared(
920
0
                    as_pred_buf_data[2][j].pu1_pred,
921
0
                    as_pred_buf_data[2][j].i4_pred_stride,
922
0
                    &au8_sigmaX[1][j],
923
0
                    &au8_sigmaXSquared[1][j],
924
0
                    ps_interp_prms->i4_blk_wd,
925
0
                    ps_interp_prms->i4_blk_ht,
926
0
                    ps_interp_prms->i4_blk_wd,
927
0
                    ps_interp_prms->i4_blk_ht,
928
0
                    0,
929
0
                    1);
930
0
            }
931
932
2.58M
            s_err_prms.pu1_inp = (U08 *)ps_inter_ctb_prms->pu1_non_wt_inp + inp_offset;
933
2.58M
            s_err_prms.i4_inp_stride = inp_stride;
934
2.58M
            s_err_prms.i4_ref_stride = as_pred_buf_data[2][j].i4_pred_stride;
935
2.58M
            s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
936
2.58M
            s_err_prms.i4_grid_mask = 1;
937
2.58M
            s_err_prms.pi4_sad_grid = &i4_sad_grid;
938
2.58M
            s_err_prms.i4_blk_wd = wd;
939
2.58M
            s_err_prms.i4_blk_ht = ht;
940
2.58M
            s_err_prms.pu1_ref = as_pred_buf_data[2][j].pu1_pred;
941
2.58M
            s_err_prms.ps_cmn_utils_optimised_function_list = ps_cmn_utils_optimised_function_list;
942
943
2.58M
            if(ps_inter_ctb_prms->u1_use_satd)
944
2.27M
            {
945
2.27M
                pf_err_compute = compute_satd_8bit;
946
2.27M
            }
947
310k
            else
948
310k
            {
949
310k
                pf_err_compute = ps_me_optimised_function_list->pf_evalsad_pt_npu_mxn_8bit;
950
310k
            }
951
952
2.58M
            pf_err_compute(&s_err_prms);
953
954
2.58M
#if USE_NOISE_TERM_DURING_BICAND_SEARCH
955
2.58M
            if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
956
0
            {
957
0
                unsigned long u4_shift_val;
958
0
                ULWORD64 u8_src_variance, u8_pred_variance, u8_pred_sigmaSquareX;
959
0
                ULWORD64 u8_temp_var, u8_temp_var1;
960
0
                S32 i4_bits_req;
961
962
0
                S32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
963
964
0
                u8_pred_sigmaSquareX = (au8_sigmaX[1][j] * au8_sigmaX[1][j]);
965
0
                u8_pred_variance = au8_sigmaXSquared[1][j] - u8_pred_sigmaSquareX;
966
967
0
                if(e_cu_size == CU_8x8)
968
0
                {
969
0
                    PART_ID_T e_part_id =
970
0
                        (PART_ID_T)((PART_ID_NxN_TL) + (x_off & 1) + ((y_off & 1) << 1));
971
972
0
                    u4_shift_val = ihevce_calc_stim_injected_variance(
973
0
                        ps_inter_ctb_prms->pu8_part_src_sigmaX,
974
0
                        ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
975
0
                        &u8_src_variance,
976
0
                        i4_default_src_wt,
977
0
                        0,
978
0
                        ps_inter_ctb_prms->wpred_log_wdc,
979
0
                        e_part_id);
980
0
                }
981
0
                else
982
0
                {
983
0
                    u4_shift_val = ihevce_calc_stim_injected_variance(
984
0
                        ps_inter_ctb_prms->pu8_part_src_sigmaX,
985
0
                        ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
986
0
                        &u8_src_variance,
987
0
                        i4_default_src_wt,
988
0
                        0,
989
0
                        ps_inter_ctb_prms->wpred_log_wdc,
990
0
                        e_part_id);
991
0
                }
992
993
0
                u8_pred_variance = u8_pred_variance >> u4_shift_val;
994
995
0
                GETRANGE64(i4_bits_req, u8_pred_variance);
996
997
0
                if(i4_bits_req > 27)
998
0
                {
999
0
                    u8_pred_variance = u8_pred_variance >> (i4_bits_req - 27);
1000
0
                    u8_src_variance = u8_src_variance >> (i4_bits_req - 27);
1001
0
                }
1002
1003
0
                if(u8_src_variance == u8_pred_variance)
1004
0
                {
1005
0
                    u8_temp_var = (1 << STIM_Q_FORMAT);
1006
0
                }
1007
0
                else
1008
0
                {
1009
0
                    u8_temp_var = (2 * u8_src_variance * u8_pred_variance);
1010
0
                    u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
1011
0
                    u8_temp_var1 =
1012
0
                        (u8_src_variance * u8_src_variance) + (u8_pred_variance * u8_pred_variance);
1013
0
                    u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
1014
0
                    u8_temp_var = (u8_temp_var / u8_temp_var1);
1015
0
                }
1016
1017
0
                i4_noise_term = (UWORD32)u8_temp_var;
1018
1019
0
                i4_noise_term *= ps_inter_ctb_prms->i4_alpha_stim_multiplier;
1020
1021
0
                ASSERT(i4_noise_term >= 0);
1022
1023
0
                u8_temp_var = i4_sad_grid;
1024
0
                u8_temp_var *= ((1 << (i4_q_level)) - (i4_noise_term));
1025
0
                u8_temp_var += (1 << ((i4_q_level)-1));
1026
0
                i4_sad_grid = (UWORD32)(u8_temp_var >> (i4_q_level));
1027
0
            }
1028
2.58M
#endif
1029
1030
2.58M
            cur_iter_best_cost = i4_sad_grid;
1031
2.58M
            cur_iter_best_cost += ps_pu_node1->i4_mv_cost;
1032
2.58M
            cur_iter_best_cost += ps_pu_node2[i].i4_mv_cost;
1033
1034
2.58M
            if(cur_iter_best_cost < bi_cost)
1035
2.58M
            {
1036
2.58M
                bi_cost = cur_iter_best_cost;
1037
2.58M
                best_cand_in_opp_dir_idx = i;
1038
2.58M
                is_best_cand_an_intra = 0;
1039
2.58M
            }
1040
2.58M
        }
1041
1042
9.08M
        uni_cost = ps_pu_node1->i4_tot_cost;
1043
1044
9.08M
#if USE_NOISE_TERM_DURING_BICAND_SEARCH
1045
9.08M
        if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
1046
0
        {
1047
0
            unsigned long u4_shift_val;
1048
0
            ULWORD64 u8_src_variance, u8_pred_variance, u8_pred_sigmaSquareX;
1049
0
            ULWORD64 u8_temp_var, u8_temp_var1;
1050
0
            S32 i4_bits_req;
1051
1052
0
            S32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
1053
1054
0
            S08 i1_ref_idx =
1055
0
                (PRED_L0 == ps_pu_node1->pu.b2_pred_mode)
1056
0
                    ? ps_inter_ctb_prms->pi1_past_list[ps_pu_node1->pu.mv.i1_l0_ref_idx]
1057
0
                    : ps_inter_ctb_prms->pi1_future_list[ps_pu_node1->pu.mv.i1_l1_ref_idx];
1058
0
            S32 i4_sad = ps_pu_node1->i4_tot_cost - ps_pu_node1->i4_mv_cost;
1059
1060
0
            u8_pred_sigmaSquareX = (au8_sigmaX[0][j] * au8_sigmaX[0][j]);
1061
0
            u8_pred_variance = au8_sigmaXSquared[0][j] - u8_pred_sigmaSquareX;
1062
1063
0
            if(e_cu_size == CU_8x8)
1064
0
            {
1065
0
                PART_ID_T e_part_id =
1066
0
                    (PART_ID_T)((PART_ID_NxN_TL) + (x_off & 1) + ((y_off & 1) << 1));
1067
1068
0
                u4_shift_val = ihevce_calc_stim_injected_variance(
1069
0
                    ps_inter_ctb_prms->pu8_part_src_sigmaX,
1070
0
                    ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
1071
0
                    &u8_src_variance,
1072
0
                    ps_inter_ctb_prms->pi4_inv_wt[i1_ref_idx],
1073
0
                    ps_inter_ctb_prms->pi4_inv_wt_shift_val[i1_ref_idx],
1074
0
                    ps_inter_ctb_prms->wpred_log_wdc,
1075
0
                    e_part_id);
1076
0
            }
1077
0
            else
1078
0
            {
1079
0
                u4_shift_val = ihevce_calc_stim_injected_variance(
1080
0
                    ps_inter_ctb_prms->pu8_part_src_sigmaX,
1081
0
                    ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
1082
0
                    &u8_src_variance,
1083
0
                    ps_inter_ctb_prms->pi4_inv_wt[i1_ref_idx],
1084
0
                    ps_inter_ctb_prms->pi4_inv_wt_shift_val[i1_ref_idx],
1085
0
                    ps_inter_ctb_prms->wpred_log_wdc,
1086
0
                    e_part_id);
1087
0
            }
1088
1089
0
            u8_pred_variance = u8_pred_variance >> (u4_shift_val);
1090
1091
0
            GETRANGE64(i4_bits_req, u8_pred_variance);
1092
1093
0
            if(i4_bits_req > 27)
1094
0
            {
1095
0
                u8_pred_variance = u8_pred_variance >> (i4_bits_req - 27);
1096
0
                u8_src_variance = u8_src_variance >> (i4_bits_req - 27);
1097
0
            }
1098
1099
0
            if(u8_src_variance == u8_pred_variance)
1100
0
            {
1101
0
                u8_temp_var = (1 << STIM_Q_FORMAT);
1102
0
            }
1103
0
            else
1104
0
            {
1105
0
                u8_temp_var = (2 * u8_src_variance * u8_pred_variance);
1106
0
                u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
1107
0
                u8_temp_var1 =
1108
0
                    (u8_src_variance * u8_src_variance) + (u8_pred_variance * u8_pred_variance);
1109
0
                u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
1110
0
                u8_temp_var = (u8_temp_var / u8_temp_var1);
1111
0
            }
1112
1113
0
            i4_noise_term = (UWORD32)u8_temp_var;
1114
1115
0
            i4_noise_term *= ps_inter_ctb_prms->i4_alpha_stim_multiplier;
1116
1117
0
            ASSERT(i4_noise_term >= 0);
1118
1119
0
            u8_temp_var = i4_sad;
1120
0
            u8_temp_var *= ((1 << (i4_q_level)) - (i4_noise_term));
1121
0
            u8_temp_var += (1 << ((i4_q_level)-1));
1122
0
            i4_sad = (UWORD32)(u8_temp_var >> (i4_q_level));
1123
1124
0
            uni_cost = i4_sad + ps_pu_node1->i4_mv_cost;
1125
1126
0
            pu8_winning_pred_sigmaX[j] = au8_sigmaX[0][j];
1127
0
            pu8_winning_pred_sigmaXSquare[j] = au8_sigmaXSquared[0][j];
1128
0
        }
1129
9.08M
#endif
1130
1131
9.08M
        if((bi_cost < uni_cost) && (!is_best_cand_an_intra))
1132
67.8k
        {
1133
67.8k
            if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
1134
0
            {
1135
0
                pu8_winning_pred_sigmaX[j] = au8_sigmaX[1][j];
1136
0
                pu8_winning_pred_sigmaXSquare[j] = au8_sigmaXSquared[1][j];
1137
0
            }
1138
1139
67.8k
            if(PRED_L0 == ps_pu_node1->pu.b2_pred_mode)
1140
33.2k
            {
1141
33.2k
                ps_pu_node1->pu.b2_pred_mode = PRED_BI;
1142
1143
33.2k
                if(PRED_L0 == ps_pu_node2[best_cand_in_opp_dir_idx].pu.b2_pred_mode)
1144
0
                {
1145
0
                    ps_pu_node1->pu.mv.i1_l1_ref_idx =
1146
0
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l0_ref_idx;
1147
0
                    ps_pu_node1->pu.mv.s_l1_mv.i2_mvx =
1148
0
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvx;
1149
0
                    ps_pu_node1->pu.mv.s_l1_mv.i2_mvy =
1150
0
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvy;
1151
0
                }
1152
33.2k
                else
1153
33.2k
                {
1154
33.2k
                    ps_pu_node1->pu.mv.i1_l1_ref_idx =
1155
33.2k
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l1_ref_idx;
1156
33.2k
                    ps_pu_node1->pu.mv.s_l1_mv.i2_mvx =
1157
33.2k
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvx;
1158
33.2k
                    ps_pu_node1->pu.mv.s_l1_mv.i2_mvy =
1159
33.2k
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvy;
1160
33.2k
                }
1161
33.2k
            }
1162
34.5k
            else
1163
34.5k
            {
1164
34.5k
                ps_pu_node1->pu.b2_pred_mode = PRED_BI;
1165
1166
34.5k
                if(PRED_L0 == ps_pu_node2[best_cand_in_opp_dir_idx].pu.b2_pred_mode)
1167
34.5k
                {
1168
34.5k
                    ps_pu_node1->pu.mv.i1_l0_ref_idx =
1169
34.5k
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l0_ref_idx;
1170
34.5k
                    ps_pu_node1->pu.mv.s_l0_mv.i2_mvx =
1171
34.5k
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvx;
1172
34.5k
                    ps_pu_node1->pu.mv.s_l0_mv.i2_mvy =
1173
34.5k
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvy;
1174
34.5k
                }
1175
0
                else
1176
0
                {
1177
0
                    ps_pu_node1->pu.mv.i1_l0_ref_idx =
1178
0
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l1_ref_idx;
1179
0
                    ps_pu_node1->pu.mv.s_l0_mv.i2_mvx =
1180
0
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvx;
1181
0
                    ps_pu_node1->pu.mv.s_l0_mv.i2_mvy =
1182
0
                        ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvy;
1183
0
                }
1184
34.5k
            }
1185
1186
67.8k
            ps_part_type_result->as_pu_results[j].i4_tot_cost = bi_cost;
1187
67.8k
        }
1188
1189
9.08M
        best_cost = MIN(uni_cost, bi_cost);
1190
9.08M
        tot_cost += best_cost;
1191
9.08M
    }
1192
1193
7.29M
    hme_debrief_bipred_eval(
1194
7.29M
        ps_part_type_result,
1195
7.29M
        as_pred_buf_data,
1196
7.29M
        &ps_inter_ctb_prms->s_pred_buf_mngr,
1197
7.29M
        au1_pred_buf_array_indixes,
1198
7.29M
        ps_cmn_utils_optimised_function_list);
1199
1200
7.29M
    ps_part_type_result->i4_tot_cost = tot_cost;
1201
7.29M
}
1202
1203
WORD32 hme_evalsatd_pt_pu_8x8_tu_rec(
1204
    err_prms_t *ps_prms,
1205
    WORD32 lambda,
1206
    WORD32 lambda_q_shift,
1207
    WORD32 i4_frm_qstep,
1208
    me_func_selector_t *ps_func_selector)
1209
3.10M
{
1210
3.10M
    S32 ai4_satd_4x4[4]; /* num 4x4s in a 8x8 */
1211
3.10M
    S32 i4_satd_8x8;
1212
3.10M
    S16 *pi2_had_out;
1213
3.10M
    S32 i4_tu_split_flag = 0;
1214
3.10M
    S32 i4_tu_early_cbf = 0;
1215
1216
3.10M
    S32 i4_early_cbf = 1;
1217
    //  S32 i4_i, i4_k;
1218
3.10M
    S32 i4_total_satd_cost = 0;
1219
3.10M
    S32 best_cost_tu_split;
1220
1221
    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1222
3.10M
    S32 *api4_satd_pu[HAD_32x32 + 1];
1223
3.10M
    S32 *api4_tu_split[HAD_32x32 + 1];
1224
3.10M
    S32 *api4_tu_early_cbf[HAD_32x32 + 1];
1225
1226
3.10M
    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1227
3.10M
    S32 *pi4_tu_split = ps_prms->pi4_tu_split_flags;
1228
3.10M
    S32 *pi4_early_cbf = ps_prms->pi4_tu_early_cbf;
1229
1230
3.10M
    U08 *pu1_inp = ps_prms->pu1_inp;
1231
3.10M
    U08 *pu1_ref = ps_prms->pu1_ref;
1232
1233
3.10M
    S32 inp_stride = ps_prms->i4_inp_stride;
1234
3.10M
    S32 ref_stride = ps_prms->i4_ref_stride;
1235
1236
    /* Initialize tu_split_cost to "0" */
1237
3.10M
    ps_prms->i4_tu_split_cost = 0;
1238
3.10M
    pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;
1239
1240
3.10M
    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
1241
3.10M
    api4_satd_pu[HAD_8x8] = &i4_satd_8x8;
1242
3.10M
    api4_satd_pu[HAD_16x16] = NULL;
1243
3.10M
    api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1244
1245
3.10M
    api4_tu_split[HAD_4x4] = NULL;
1246
3.10M
    api4_tu_split[HAD_8x8] = &i4_tu_split_flag;
1247
3.10M
    api4_tu_split[HAD_16x16] = NULL;
1248
3.10M
    api4_tu_split[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1249
1250
3.10M
    api4_tu_early_cbf[HAD_4x4] = NULL;
1251
3.10M
    api4_tu_early_cbf[HAD_8x8] = &i4_tu_early_cbf;
1252
3.10M
    api4_tu_early_cbf[HAD_16x16] = NULL;
1253
3.10M
    api4_tu_early_cbf[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1254
1255
    /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
1256
1257
    /* Return value is merge of both best_stad_cost and tu_split_flags */
1258
3.10M
    best_cost_tu_split = ps_func_selector->pf_had_8x8_using_4_4x4_r(
1259
3.10M
        pu1_inp,
1260
3.10M
        inp_stride,
1261
3.10M
        pu1_ref,
1262
3.10M
        ref_stride,
1263
3.10M
        pi2_had_out,
1264
3.10M
        8,
1265
3.10M
        api4_satd_pu,
1266
3.10M
        api4_tu_split,
1267
3.10M
        api4_tu_early_cbf,
1268
3.10M
        0,
1269
3.10M
        2,
1270
3.10M
        0,
1271
3.10M
        0,
1272
3.10M
        i4_frm_qstep,
1273
3.10M
        0,
1274
3.10M
        ps_prms->u1_max_tr_depth,
1275
3.10M
        ps_prms->u1_max_tr_size,
1276
3.10M
        &(ps_prms->i4_tu_split_cost),
1277
3.10M
        NULL);
1278
1279
    /* For SATD computation following TU size are assumed for a 8x8 CU */
1280
    /* 8 for 2Nx2N, 4 for Nx2N,2NxN                                    */
1281
1282
3.10M
    i4_total_satd_cost = best_cost_tu_split >> 2;
1283
1284
    /* Second last bit has the tu pslit flag */
1285
3.10M
    i4_tu_split_flag = (best_cost_tu_split & 0x3) >> 1;
1286
1287
    /* Last bit corrsponds to the Early CBF flag */
1288
3.10M
    i4_early_cbf = (best_cost_tu_split & 0x1);
1289
1290
    /* Update 8x8 SATDs */
1291
3.10M
    pi4_sad_grid[PART_ID_2Nx2N] = i4_satd_8x8;
1292
3.10M
    pi4_tu_split[PART_ID_2Nx2N] = i4_tu_split_flag;
1293
3.10M
    pi4_early_cbf[PART_ID_2Nx2N] = i4_early_cbf;
1294
1295
3.10M
    return i4_total_satd_cost;
1296
3.10M
}
1297
//#endif
1298
/**
1299
********************************************************************************
1300
*  @fn     S32 hme_evalsatd_update_1_best_result_pt_pu_16x16
1301
*
1302
*  @brief  Evaluates the SATD with partial updates for all the best partitions
1303
*          of a 16x16 CU based on recursive Hadamard 16x16, 8x8 and 4x4 satds
1304
*
1305
*  @param[inout]  ps_prms: error prms containg current and ref ptr, strides,
1306
*                 pointer to sad grid of each partitions
1307
*
1308
*  @return     None
1309
********************************************************************************
1310
*/
1311
1312
void hme_evalsatd_update_2_best_results_pt_pu_16x16(
1313
    err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms)
1314
0
{
1315
0
    S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */
1316
0
    S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */
1317
0
    S32 i4_satd_16x16; /* 16x16 satd cost     */
1318
0
    S32 i;
1319
0
    S16 ai2_8x8_had[256];
1320
0
    S16 *pi2_y0;
1321
0
    U08 *pu1_src, *pu1_pred;
1322
0
    S32 pos_x_y_4x4_0, pos_x_y_4x4 = 0;
1323
0
    S32 *ppi4_hsad;
1324
1325
    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1326
0
    S32 *api4_satd_pu[HAD_32x32 + 1];
1327
0
    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1328
1329
0
    U08 *pu1_inp = ps_prms->pu1_inp;
1330
0
    U08 *pu1_ref = ps_prms->pu1_ref;
1331
1332
0
    S32 inp_stride = ps_prms->i4_inp_stride;
1333
0
    S32 ref_stride = ps_prms->i4_ref_stride;
1334
1335
0
    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
1336
0
    api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
1337
0
    api4_satd_pu[HAD_16x16] = &i4_satd_16x16;
1338
0
    api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1339
1340
0
    ppi4_hsad = api4_satd_pu[HAD_16x16];
1341
1342
    /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
1343
0
    for(i = 0; i < 4; i++)
1344
0
    {
1345
0
        pu1_src = pu1_inp + (i & 0x01) * 8 + (i >> 1) * inp_stride * 8;
1346
0
        pu1_pred = pu1_ref + (i & 0x01) * 8 + (i >> 1) * ref_stride * 8;
1347
0
        pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8;
1348
0
        pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16);
1349
1350
0
        ihevce_had_8x8_using_4_4x4(
1351
0
            pu1_src, inp_stride, pu1_pred, ref_stride, pi2_y0, 16, api4_satd_pu, pos_x_y_4x4_0, 4);
1352
0
    }
1353
1354
    /* For SATD computation following TU size are assumed for a 16x16 CU */
1355
    /* 16 for 2Nx2N, 8 for NxN/Nx2N,2NxN and mix of 4 and 8 for AMPs     */
1356
1357
    /* Update 8x8 SATDs */
1358
    /* Modified to cost calculation using only 4x4 SATD */
1359
1360
    //  ai4_satd_8x8[0] = ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
1361
    //  ai4_satd_8x8[1] = ai4_satd_4x4[2] + ai4_satd_4x4[3] + ai4_satd_4x4[6] + ai4_satd_4x4[7];
1362
    //  ai4_satd_8x8[2] = ai4_satd_4x4[8] + ai4_satd_4x4[9] + ai4_satd_4x4[12] + ai4_satd_4x4[13];
1363
    //  ai4_satd_8x8[3] = ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];
1364
1365
    /* Update 16x16 SATDs */
1366
0
    pi4_sad_grid[PART_ID_2Nx2N] =
1367
0
        ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];
1368
1369
0
    pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_8x8[0];
1370
0
    pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_8x8[1];
1371
0
    pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_8x8[2];
1372
0
    pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_8x8[3];
1373
1374
    /* Update 8x16 / 16x8 SATDs */
1375
0
    pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_8x8[0] + ai4_satd_8x8[2];
1376
0
    pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[3];
1377
0
    pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_8x8[0] + ai4_satd_8x8[1];
1378
0
    pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_8x8[2] + ai4_satd_8x8[3];
1379
1380
    /* Update AMP SATDs 16x12,16x4, 12x16,4x16  */
1381
0
    pi4_sad_grid[PART_ID_nLx2N_L] =
1382
0
        ai4_satd_4x4[0] + ai4_satd_4x4[4] + ai4_satd_4x4[8] + ai4_satd_4x4[12];
1383
1384
0
    pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_4x4[1] + ai4_satd_4x4[5] + ai4_satd_4x4[9] +
1385
0
                                    ai4_satd_4x4[13] + pi4_sad_grid[PART_ID_Nx2N_R];
1386
1387
0
    pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_4x4[2] + ai4_satd_4x4[6] + ai4_satd_4x4[10] +
1388
0
                                    ai4_satd_4x4[14] + pi4_sad_grid[PART_ID_Nx2N_L];
1389
1390
0
    pi4_sad_grid[PART_ID_nRx2N_R] =
1391
0
        ai4_satd_4x4[3] + ai4_satd_4x4[7] + ai4_satd_4x4[11] + ai4_satd_4x4[15];
1392
1393
0
    pi4_sad_grid[PART_ID_2NxnU_T] =
1394
0
        ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[2] + ai4_satd_4x4[3];
1395
1396
0
    pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_4x4[4] + ai4_satd_4x4[5] + ai4_satd_4x4[6] +
1397
0
                                    ai4_satd_4x4[7] + pi4_sad_grid[PART_ID_2NxN_B];
1398
1399
0
    pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_4x4[8] + ai4_satd_4x4[9] + ai4_satd_4x4[10] +
1400
0
                                    ai4_satd_4x4[11] + pi4_sad_grid[PART_ID_2NxN_T];
1401
1402
0
    pi4_sad_grid[PART_ID_2NxnD_B] =
1403
0
        ai4_satd_4x4[12] + ai4_satd_4x4[13] + ai4_satd_4x4[14] + ai4_satd_4x4[15];
1404
1405
    /* Call the update results function */
1406
0
    {
1407
0
        S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
1408
0
        mv_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
1409
0
        S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
1410
0
        S32 best_node_cost;
1411
0
        S32 second_best_node_cost;
1412
1413
        /*For each valid partition, update the refine_prm structure to reflect the best and second
1414
        best candidates for that partition*/
1415
1416
0
        for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
1417
0
        {
1418
0
            S32 update_required = 0;
1419
0
            S32 part_id = pi4_valid_part_ids[i4_count];
1420
0
            S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
1421
1422
            /* Use a pre-computed cost instead of freshly evaluating subpel cost */
1423
0
            i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
1424
1425
            /*Calculate total cost*/
1426
0
            i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
1427
0
            i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
1428
1429
            /*****************************************************************/
1430
            /* We do not labor through the results if the total cost worse   */
1431
            /* than the last of the results.                                 */
1432
            /*****************************************************************/
1433
0
            best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
1434
0
            second_best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[1][index]);
1435
1436
0
            if(i4_tot_cost < second_best_node_cost)
1437
0
            {
1438
0
                update_required = 2;
1439
1440
                /*************************************************************/
1441
                /* Identify where the current result isto be placed.Basically*/
1442
                /* find the node which has cost just higher thannodeundertest*/
1443
                /*************************************************************/
1444
0
                if(i4_tot_cost < best_node_cost)
1445
0
                {
1446
0
                    update_required = 1;
1447
0
                }
1448
0
                else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
1449
0
                {
1450
0
                    update_required = 0;
1451
0
                }
1452
0
                if(update_required == 2)
1453
0
                {
1454
0
                    ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
1455
0
                    ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
1456
0
                    ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
1457
0
                    ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
1458
0
                    ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
1459
0
                }
1460
0
                else if(update_required == 1)
1461
0
                {
1462
0
                    ps_subpel_refine_ctxt->i2_tot_cost[1][index] =
1463
0
                        ps_subpel_refine_ctxt->i2_tot_cost[0][index];
1464
0
                    ps_subpel_refine_ctxt->i2_mv_cost[1][index] =
1465
0
                        ps_subpel_refine_ctxt->i2_mv_cost[0][index];
1466
0
                    ps_subpel_refine_ctxt->i2_mv_x[1][index] =
1467
0
                        ps_subpel_refine_ctxt->i2_mv_x[0][index];
1468
0
                    ps_subpel_refine_ctxt->i2_mv_y[1][index] =
1469
0
                        ps_subpel_refine_ctxt->i2_mv_y[0][index];
1470
0
                    ps_subpel_refine_ctxt->i2_ref_idx[1][index] =
1471
0
                        ps_subpel_refine_ctxt->i2_ref_idx[0][index];
1472
1473
0
                    ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
1474
0
                    ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
1475
0
                    ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
1476
0
                    ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
1477
0
                    ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
1478
0
                }
1479
0
            }
1480
0
        }
1481
0
    }
1482
0
}
1483
1484
//#if COMPUTE_16x16_R == C
1485
void hme_evalsatd_update_1_best_result_pt_pu_16x16(
1486
    err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms)
1487
21.1M
{
1488
21.1M
    S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */
1489
21.1M
    S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */
1490
21.1M
    S32 i4_satd_16x16; /* 16x16 satd cost     */
1491
21.1M
    S32 i;
1492
21.1M
    S16 ai2_8x8_had[256];
1493
21.1M
    S16 *pi2_y0;
1494
21.1M
    U08 *pu1_src, *pu1_pred;
1495
21.1M
    S32 pos_x_y_4x4_0, pos_x_y_4x4 = 0;
1496
21.1M
    S32 *ppi4_hsad;
1497
1498
    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1499
21.1M
    S32 *api4_satd_pu[HAD_32x32 + 1];
1500
21.1M
    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1501
1502
21.1M
    U08 *pu1_inp = ps_prms->pu1_inp;
1503
21.1M
    U08 *pu1_ref = ps_prms->pu1_ref;
1504
1505
21.1M
    S32 inp_stride = ps_prms->i4_inp_stride;
1506
21.1M
    S32 ref_stride = ps_prms->i4_ref_stride;
1507
1508
21.1M
    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
1509
21.1M
    api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
1510
21.1M
    api4_satd_pu[HAD_16x16] = &i4_satd_16x16;
1511
21.1M
    api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1512
1513
21.1M
    ppi4_hsad = api4_satd_pu[HAD_16x16];
1514
1515
    /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
1516
105M
    for(i = 0; i < 4; i++)
1517
84.6M
    {
1518
84.6M
        pu1_src = pu1_inp + (i & 0x01) * 8 + (i >> 1) * inp_stride * 8;
1519
84.6M
        pu1_pred = pu1_ref + (i & 0x01) * 8 + (i >> 1) * ref_stride * 8;
1520
84.6M
        pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8;
1521
84.6M
        pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16);
1522
1523
84.6M
        ihevce_had_8x8_using_4_4x4(
1524
84.6M
            pu1_src, inp_stride, pu1_pred, ref_stride, pi2_y0, 16, api4_satd_pu, pos_x_y_4x4_0, 4);
1525
84.6M
    }
1526
1527
    /* For SATD computation following TU size are assumed for a 16x16 CU */
1528
    /* 16 for 2Nx2N, 8 for NxN/Nx2N,2NxN and mix of 4 and 8 for AMPs     */
1529
1530
    /* Update 8x8 SATDs */
1531
    /* Modified to cost calculation using only 4x4 SATD */
1532
1533
    //  ai4_satd_8x8[0] = ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
1534
    //  ai4_satd_8x8[1] = ai4_satd_4x4[2] + ai4_satd_4x4[3] + ai4_satd_4x4[6] + ai4_satd_4x4[7];
1535
    //  ai4_satd_8x8[2] = ai4_satd_4x4[8] + ai4_satd_4x4[9] + ai4_satd_4x4[12] + ai4_satd_4x4[13];
1536
    //  ai4_satd_8x8[3] = ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];
1537
1538
    /* Update 16x16 SATDs */
1539
21.1M
    pi4_sad_grid[PART_ID_2Nx2N] =
1540
21.1M
        ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];
1541
1542
21.1M
    pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_8x8[0];
1543
21.1M
    pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_8x8[1];
1544
21.1M
    pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_8x8[2];
1545
21.1M
    pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_8x8[3];
1546
1547
    /* Update 8x16 / 16x8 SATDs */
1548
21.1M
    pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_8x8[0] + ai4_satd_8x8[2];
1549
21.1M
    pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[3];
1550
21.1M
    pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_8x8[0] + ai4_satd_8x8[1];
1551
21.1M
    pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_8x8[2] + ai4_satd_8x8[3];
1552
1553
    /* Update AMP SATDs 16x12,16x4, 12x16,4x16  */
1554
21.1M
    pi4_sad_grid[PART_ID_nLx2N_L] =
1555
21.1M
        ai4_satd_4x4[0] + ai4_satd_4x4[2] + ai4_satd_4x4[8] + ai4_satd_4x4[10];
1556
21.1M
    pi4_sad_grid[PART_ID_nRx2N_R] =
1557
21.1M
        ai4_satd_4x4[5] + ai4_satd_4x4[7] + ai4_satd_4x4[13] + ai4_satd_4x4[15];
1558
21.1M
    pi4_sad_grid[PART_ID_2NxnU_T] =
1559
21.1M
        ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
1560
21.1M
    pi4_sad_grid[PART_ID_2NxnD_B] =
1561
21.1M
        ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];
1562
1563
21.1M
    pi4_sad_grid[PART_ID_nLx2N_R] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
1564
21.1M
    pi4_sad_grid[PART_ID_nRx2N_L] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
1565
21.1M
    pi4_sad_grid[PART_ID_2NxnU_B] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
1566
21.1M
    pi4_sad_grid[PART_ID_2NxnD_T] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
1567
1568
    /* Call the update results function */
1569
21.1M
    {
1570
21.1M
        S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
1571
21.1M
        mv_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
1572
21.1M
        S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
1573
21.1M
        S32 best_node_cost;
1574
21.1M
        S32 second_best_node_cost;
1575
1576
        /*For each valid partition, update the refine_prm structure to reflect the best and second
1577
        best candidates for that partition*/
1578
1579
291M
        for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
1580
269M
        {
1581
269M
            S32 update_required = 0;
1582
269M
            S32 part_id = pi4_valid_part_ids[i4_count];
1583
269M
            S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
1584
1585
            /* Use a pre-computed cost instead of freshly evaluating subpel cost */
1586
269M
            i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
1587
1588
            /*Calculate total cost*/
1589
269M
            i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
1590
269M
            i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
1591
1592
            /*****************************************************************/
1593
            /* We do not labor through the results if the total cost worse   */
1594
            /* than the last of the results.                                 */
1595
            /*****************************************************************/
1596
269M
            best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
1597
269M
            second_best_node_cost = SHRT_MAX;
1598
1599
269M
            if(i4_tot_cost < second_best_node_cost)
1600
269M
            {
1601
269M
                update_required = 0;
1602
1603
                /*************************************************************/
1604
                /* Identify where the current result isto be placed.Basically*/
1605
                /* find the node which has cost just higher thannodeundertest*/
1606
                /*************************************************************/
1607
269M
                if(i4_tot_cost < best_node_cost)
1608
5.44M
                {
1609
5.44M
                    update_required = 1;
1610
5.44M
                }
1611
264M
                else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
1612
203M
                {
1613
203M
                    update_required = 0;
1614
203M
                }
1615
269M
                if(update_required == 2)
1616
0
                {
1617
0
                    ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
1618
0
                    ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
1619
0
                    ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
1620
0
                    ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
1621
0
                    ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
1622
0
                }
1623
269M
                else if(update_required == 1)
1624
5.44M
                {
1625
5.44M
                    ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
1626
5.44M
                    ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
1627
5.44M
                    ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
1628
5.44M
                    ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
1629
5.44M
                    ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
1630
5.44M
                }
1631
269M
            }
1632
269M
        }
1633
21.1M
    }
1634
21.1M
}
1635
1636
WORD32 hme_evalsatd_pt_pu_16x16_tu_rec(
1637
    err_prms_t *ps_prms,
1638
    WORD32 lambda,
1639
    WORD32 lambda_q_shift,
1640
    WORD32 i4_frm_qstep,
1641
    me_func_selector_t *ps_func_selector)
1642
3.11M
{
1643
3.11M
    S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */
1644
3.11M
    S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */
1645
3.11M
    S32 ai4_tu_split_8x8[16];
1646
3.11M
    S32 i4_satd_16x16; /* 16x16 satd cost     */
1647
1648
3.11M
    S32 ai4_tu_early_cbf_8x8[16];
1649
1650
    //S16 ai2_had_out[256];
1651
3.11M
    S16 *pi2_had_out;
1652
3.11M
    S32 tu_split_flag = 0;
1653
3.11M
    S32 early_cbf_flag = 0;
1654
3.11M
    S32 total_satd_cost = 0;
1655
1656
    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1657
3.11M
    S32 *api4_satd_pu[HAD_32x32 + 1];
1658
3.11M
    S32 *api4_tu_split[HAD_32x32 + 1];
1659
3.11M
    S32 *api4_tu_early_cbf[HAD_32x32 + 1];
1660
1661
3.11M
    U08 *pu1_inp = ps_prms->pu1_inp;
1662
3.11M
    U08 *pu1_ref = ps_prms->pu1_ref;
1663
1664
3.11M
    S32 inp_stride = ps_prms->i4_inp_stride;
1665
3.11M
    S32 ref_stride = ps_prms->i4_ref_stride;
1666
1667
    /* Initialize tu_split_cost to "0" */
1668
3.11M
    ps_prms->i4_tu_split_cost = 0;
1669
1670
3.11M
    pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;
1671
1672
3.11M
    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
1673
3.11M
    api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
1674
3.11M
    api4_satd_pu[HAD_16x16] = &i4_satd_16x16;
1675
3.11M
    api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1676
1677
3.11M
    api4_tu_split[HAD_4x4] = NULL;
1678
3.11M
    api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0];
1679
3.11M
    api4_tu_split[HAD_16x16] = &tu_split_flag;
1680
3.11M
    api4_tu_split[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1681
1682
3.11M
    api4_tu_early_cbf[HAD_4x4] = NULL;
1683
3.11M
    api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0];
1684
3.11M
    api4_tu_early_cbf[HAD_16x16] = &early_cbf_flag;
1685
3.11M
    api4_tu_early_cbf[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
1686
1687
    /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
1688
3.11M
    ps_func_selector->pf_had_16x16_r(
1689
3.11M
        pu1_inp,
1690
3.11M
        inp_stride,
1691
3.11M
        pu1_ref,
1692
3.11M
        ref_stride,
1693
3.11M
        pi2_had_out,
1694
3.11M
        16,
1695
3.11M
        api4_satd_pu,
1696
3.11M
        api4_tu_split,
1697
3.11M
        api4_tu_early_cbf,
1698
3.11M
        0,
1699
3.11M
        4,
1700
3.11M
        lambda,
1701
3.11M
        lambda_q_shift,
1702
3.11M
        i4_frm_qstep,
1703
3.11M
        0,
1704
3.11M
        ps_prms->u1_max_tr_depth,
1705
3.11M
        ps_prms->u1_max_tr_size,
1706
3.11M
        &(ps_prms->i4_tu_split_cost),
1707
3.11M
        NULL);
1708
1709
3.11M
    total_satd_cost = i4_satd_16x16;
1710
1711
3.11M
    ps_prms->pi4_tu_split_flags[0] = tu_split_flag;
1712
1713
3.11M
    ps_prms->pi4_tu_early_cbf[0] = early_cbf_flag;
1714
1715
3.11M
    return total_satd_cost;
1716
3.11M
}
1717
1718
/**
1719
********************************************************************************
1720
*  @fn     S32 hme_evalsatd_pt_pu_32x32
1721
*
1722
*  @brief  Evaluates the SATD with partial updates for all the best partitions
1723
*          of a 32x32 CU based on recursive Hadamard 16x16, 8x8 and 4x4 satds
1724
*
1725
*  @param[inout]  ps_prms: error prms containg current and ref ptr, strides,
1726
*                 pointer to sad grid of each partitions
1727
*
1728
*  @return     None
1729
********************************************************************************
1730
*/
1731
void hme_evalsatd_pt_pu_32x32(err_prms_t *ps_prms)
1732
437k
{
1733
    //S32 ai4_satd_4x4[64];   /* num 4x4s in a 32x32 */
1734
437k
    S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 */
1735
437k
    S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32 */
1736
437k
    S32 i4_satd_32x32;
1737
    //    S16 ai2_had_out[32*32];
1738
437k
    U08 *pu1_src;
1739
437k
    U08 *pu1_pred;
1740
437k
    S32 i;
1741
1742
    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1743
437k
    S32 *api4_satd_pu[HAD_32x32 + 1];
1744
437k
    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1745
1746
437k
    U08 *pu1_inp = ps_prms->pu1_inp;
1747
437k
    U08 *pu1_ref = ps_prms->pu1_ref;
1748
1749
437k
    S32 inp_stride = ps_prms->i4_inp_stride;
1750
437k
    S32 ref_stride = ps_prms->i4_ref_stride;
1751
1752
    //api4_satd_pu[HAD_4x4]   = &ai4_satd_4x4[0];
1753
437k
    api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
1754
437k
    api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0];
1755
437k
    api4_satd_pu[HAD_32x32] = &i4_satd_32x32;
1756
1757
    /* 32x32 SATD is calculates as the sum of the 4 8x8's in the block */
1758
7.43M
    for(i = 0; i < 16; i++)
1759
7.00M
    {
1760
7.00M
        pu1_src = pu1_inp + ((i & 0x3) << 3) + ((i >> 2) * inp_stride * 8);
1761
1762
7.00M
        pu1_pred = pu1_ref + ((i & 0x3) << 3) + ((i >> 2) * ref_stride * 8);
1763
1764
7.00M
        ai4_satd_8x8[i] = ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
1765
7.00M
            pu1_src, inp_stride, pu1_pred, ref_stride, NULL, 1);
1766
7.00M
    }
1767
1768
    /* Modified to cost calculation using only 8x8 SATD for 32x32*/
1769
437k
    ai4_satd_16x16[0] = ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[4] + ai4_satd_8x8[5];
1770
437k
    ai4_satd_16x16[1] = ai4_satd_8x8[2] + ai4_satd_8x8[3] + ai4_satd_8x8[6] + ai4_satd_8x8[7];
1771
437k
    ai4_satd_16x16[2] = ai4_satd_8x8[8] + ai4_satd_8x8[9] + ai4_satd_8x8[12] + ai4_satd_8x8[13];
1772
437k
    ai4_satd_16x16[3] = ai4_satd_8x8[10] + ai4_satd_8x8[11] + ai4_satd_8x8[14] + ai4_satd_8x8[15];
1773
1774
    /* Update 32x32 SATD */
1775
437k
    pi4_sad_grid[PART_ID_2Nx2N] =
1776
437k
        ai4_satd_16x16[0] + ai4_satd_16x16[1] + ai4_satd_16x16[2] + ai4_satd_16x16[3];
1777
1778
    /* Update 16x16 SATDs */
1779
437k
    pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_16x16[0];
1780
437k
    pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_16x16[1];
1781
437k
    pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_16x16[2];
1782
437k
    pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_16x16[3];
1783
1784
    /* Update 16x32 / 32x16 SATDs */
1785
437k
    pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_16x16[0] + ai4_satd_16x16[2];
1786
437k
    pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_16x16[1] + ai4_satd_16x16[3];
1787
437k
    pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_16x16[0] + ai4_satd_16x16[1];
1788
437k
    pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_16x16[2] + ai4_satd_16x16[3];
1789
1790
    /* Update AMP SATDs 32x24,32x8, 24x32,8x32  */
1791
437k
    pi4_sad_grid[PART_ID_nLx2N_L] =
1792
437k
        ai4_satd_8x8[0] + ai4_satd_8x8[4] + ai4_satd_8x8[8] + ai4_satd_8x8[12];
1793
1794
437k
    pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[5] + ai4_satd_8x8[9] +
1795
437k
                                    ai4_satd_8x8[13] + pi4_sad_grid[PART_ID_Nx2N_R];
1796
1797
437k
    pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_8x8[2] + ai4_satd_8x8[6] + ai4_satd_8x8[10] +
1798
437k
                                    ai4_satd_8x8[14] + pi4_sad_grid[PART_ID_Nx2N_L];
1799
1800
437k
    pi4_sad_grid[PART_ID_nRx2N_R] =
1801
437k
        ai4_satd_8x8[3] + ai4_satd_8x8[7] + ai4_satd_8x8[11] + ai4_satd_8x8[15];
1802
1803
437k
    pi4_sad_grid[PART_ID_2NxnU_T] =
1804
437k
        ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];
1805
1806
437k
    pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_8x8[4] + ai4_satd_8x8[5] + ai4_satd_8x8[6] +
1807
437k
                                    ai4_satd_8x8[7] + pi4_sad_grid[PART_ID_2NxN_B];
1808
1809
437k
    pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_8x8[8] + ai4_satd_8x8[9] + ai4_satd_8x8[10] +
1810
437k
                                    ai4_satd_8x8[11] + pi4_sad_grid[PART_ID_2NxN_T];
1811
1812
437k
    pi4_sad_grid[PART_ID_2NxnD_B] =
1813
437k
        ai4_satd_8x8[12] + ai4_satd_8x8[13] + ai4_satd_8x8[14] + ai4_satd_8x8[15];
1814
437k
}
1815
1816
WORD32 hme_evalsatd_pt_pu_32x32_tu_rec(
1817
    err_prms_t *ps_prms,
1818
    WORD32 lambda,
1819
    WORD32 lambda_q_shift,
1820
    WORD32 i4_frm_qstep,
1821
    me_func_selector_t *ps_func_selector)
1822
991k
{
1823
991k
    S32 ai4_satd_4x4[64]; /* num 4x4s in a 32x32 */
1824
991k
    S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 */
1825
991k
    S32 ai4_tu_split_8x8[16];
1826
991k
    S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32 */
1827
991k
    S32 ai4_tu_split_16x16[4];
1828
991k
    S32 i4_satd_32x32;
1829
1830
991k
    S32 ai4_tu_early_cbf_8x8[16];
1831
991k
    S32 ai4_tu_early_cbf_16x16[4];
1832
991k
    S32 early_cbf_flag;
1833
1834
991k
    S16 *pi2_had_out;
1835
1836
    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1837
991k
    S32 *api4_satd_pu[HAD_32x32 + 1];
1838
991k
    S32 *api4_tu_split[HAD_32x32 + 1];
1839
991k
    S32 *api4_tu_early_cbf[HAD_32x32 + 1];
1840
1841
991k
    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1842
991k
    S32 *pi4_tu_split_flag = ps_prms->pi4_tu_split_flags;
1843
991k
    S32 *pi4_tu_early_cbf = ps_prms->pi4_tu_early_cbf;
1844
1845
991k
    S32 tu_split_flag = 0;
1846
991k
    S32 total_satd_cost = 0;
1847
1848
991k
    U08 *pu1_inp = ps_prms->pu1_inp;
1849
991k
    U08 *pu1_ref = ps_prms->pu1_ref;
1850
1851
991k
    S32 inp_stride = ps_prms->i4_inp_stride;
1852
991k
    S32 ref_stride = ps_prms->i4_ref_stride;
1853
1854
    /* Initialize tu_split_cost to "0" */
1855
991k
    ps_prms->i4_tu_split_cost = 0;
1856
1857
991k
    pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;
1858
1859
991k
    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
1860
991k
    api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
1861
991k
    api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0];
1862
991k
    api4_satd_pu[HAD_32x32] = &i4_satd_32x32;
1863
1864
991k
    api4_tu_split[HAD_4x4] = NULL;
1865
991k
    api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0];
1866
991k
    api4_tu_split[HAD_16x16] = &ai4_tu_split_16x16[0];
1867
991k
    api4_tu_split[HAD_32x32] = &tu_split_flag;
1868
1869
991k
    api4_tu_early_cbf[HAD_4x4] = NULL;
1870
991k
    api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0];
1871
991k
    api4_tu_early_cbf[HAD_16x16] = &ai4_tu_early_cbf_16x16[0];
1872
991k
    api4_tu_early_cbf[HAD_32x32] = &early_cbf_flag;
1873
1874
    /* Call recursive 32x32 HAD module; updates satds for 4x4, 8x8, 16x16 and 32x32 */
1875
991k
    ihevce_had_32x32_r(
1876
991k
        pu1_inp,
1877
991k
        inp_stride,
1878
991k
        pu1_ref,
1879
991k
        ref_stride,
1880
991k
        pi2_had_out,
1881
991k
        32,
1882
991k
        api4_satd_pu,
1883
991k
        api4_tu_split,
1884
991k
        api4_tu_early_cbf,
1885
991k
        0,
1886
991k
        8,
1887
991k
        lambda,
1888
991k
        lambda_q_shift,
1889
991k
        i4_frm_qstep,
1890
991k
        0,
1891
991k
        ps_prms->u1_max_tr_depth,
1892
991k
        ps_prms->u1_max_tr_size,
1893
991k
        &(ps_prms->i4_tu_split_cost),
1894
991k
        ps_func_selector);
1895
1896
991k
    total_satd_cost = i4_satd_32x32;
1897
1898
    /*The structure of the TU_SPLIT flag for the current 32x32 is as follows
1899
    TL_16x16 - 5bits (4 for child and LSBit for 16x16 split)
1900
    TR_16x16 - 5bits (4 for child and LSBit for 16x16 split)
1901
    BL_16x16 - 5bits (4 for child and LSBit for 16x16 split)
1902
    BR_16x16 - 5bits (4 for child and LSBit for 16x16 split)
1903
    32x32_split - 1bit (LSBit)
1904
1905
    TU_SPLIT : (TL_16x16)_(TR_16x16)_(BL_16x16)_(BR_16x16)_32x32_split (21bits)*/
1906
1907
991k
    pi4_sad_grid[PART_ID_2Nx2N] = total_satd_cost;
1908
991k
    pi4_tu_split_flag[PART_ID_2Nx2N] = tu_split_flag;
1909
991k
    pi4_tu_early_cbf[PART_ID_2Nx2N] = early_cbf_flag;
1910
1911
991k
    return total_satd_cost;
1912
991k
}
1913
1914
/**
1915
********************************************************************************
1916
*  @fn     S32 hme_evalsatd_pt_pu_64x64
1917
*
1918
*  @brief  Evaluates the SATD with partial updates for all the best partitions
1919
*          of a 64x64 CU based on accumulated Hadamard 32x32 and 16x16 satds
1920
*
1921
*           Note : 64x64 SATD does not do hadamard Transform using 32x32 hadamard
1922
*                  outputs but directly uses four 32x32 SATD and 16 16x16 SATDS as
1923
*                  TU size of 64 is not supported in HEVC
1924
*
1925
*  @param[inout]  ps_prms: error prms containg current and ref ptr, strides,
1926
*                 pointer to sad grid of each partitions
1927
*
1928
*  @return     None
1929
********************************************************************************
1930
*/
1931
1932
void hme_evalsatd_pt_pu_64x64(err_prms_t *ps_prms)
1933
55.9k
{
1934
    //S32 ai4_satd_4x4[4][64];   /* num 4x4s in a 32x32 * num 32x32 in 64x64 */
1935
55.9k
    S32 ai4_satd_8x8[4][16]; /* num 8x8s in a 32x32 * num 32x32 in 64x64 */
1936
55.9k
    S32 ai4_satd_16x16[4][4]; /* num 16x16 in a 32x32* num 32x32 in 64x64 */
1937
55.9k
    S32 ai4_satd_32x32[4]; /* num 32x32 in 64x64 */
1938
    //    S16 ai2_had_out[32*32];
1939
55.9k
    S32 i, j;
1940
1941
    //  S32 ai4_tu_split_8x8[4][16];
1942
    //  S32 ai4_tu_split_16x16[4][4];
1943
    //  S32 ai4_tu_split_32x32[4];
1944
1945
    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1946
55.9k
    S32 *api4_satd_pu[HAD_32x32 + 1];
1947
    //  S32 *api4_tu_split[HAD_32x32 + 1];
1948
1949
55.9k
    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1950
1951
55.9k
    U08 *pu1_inp = ps_prms->pu1_inp;
1952
55.9k
    U08 *pu1_ref = ps_prms->pu1_ref;
1953
55.9k
    U08 *pu1_src;
1954
55.9k
    U08 *pu1_pred;
1955
1956
55.9k
    S32 inp_stride = ps_prms->i4_inp_stride;
1957
55.9k
    S32 ref_stride = ps_prms->i4_ref_stride;
1958
1959
279k
    for(i = 0; i < 4; i++)
1960
223k
    {
1961
223k
        S32 blkx = (i & 0x1);
1962
223k
        S32 blky = (i >> 1);
1963
223k
        U08 *pu1_pi0, *pu1_pi1;
1964
1965
        //api4_satd_pu[HAD_4x4]   = &ai4_satd_4x4[i][0];
1966
223k
        api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[i][0];
1967
223k
        api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[i][0];
1968
223k
        api4_satd_pu[HAD_32x32] = &ai4_satd_32x32[i];
1969
1970
223k
        pu1_pi0 = pu1_inp + (blkx * 32) + (blky * 32 * inp_stride);
1971
223k
        pu1_pi1 = pu1_ref + (blkx * 32) + (blky * 32 * ref_stride);
1972
1973
        /* 64x64 SATD is calculates as the sum of the 4 16x16's in the block */
1974
3.80M
        for(j = 0; j < 16; j++)
1975
3.58M
        {
1976
3.58M
            pu1_src = pu1_pi0 + ((j & 0x3) << 3) + ((j >> 2) * inp_stride * 8);
1977
1978
3.58M
            pu1_pred = pu1_pi1 + ((j & 0x3) << 3) + ((j >> 2) * ref_stride * 8);
1979
1980
3.58M
            ai4_satd_8x8[i][j] = ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
1981
3.58M
                pu1_src, inp_stride, pu1_pred, ref_stride, NULL, 1);
1982
3.58M
        }
1983
1984
        /* Modified to cost calculation using only 8x8 SATD for 32x32*/
1985
223k
        ai4_satd_16x16[i][0] =
1986
223k
            ai4_satd_8x8[i][0] + ai4_satd_8x8[i][1] + ai4_satd_8x8[i][4] + ai4_satd_8x8[i][5];
1987
223k
        ai4_satd_16x16[i][1] =
1988
223k
            ai4_satd_8x8[i][2] + ai4_satd_8x8[i][3] + ai4_satd_8x8[i][6] + ai4_satd_8x8[i][7];
1989
223k
        ai4_satd_16x16[i][2] =
1990
223k
            ai4_satd_8x8[i][8] + ai4_satd_8x8[i][9] + ai4_satd_8x8[i][12] + ai4_satd_8x8[i][13];
1991
223k
        ai4_satd_16x16[i][3] =
1992
223k
            ai4_satd_8x8[i][10] + ai4_satd_8x8[i][11] + ai4_satd_8x8[i][14] + ai4_satd_8x8[i][15];
1993
223k
    }
1994
1995
    /* Modified to cost calculation using only 8x8 SATD for 32x32*/
1996
1997
55.9k
    ai4_satd_32x32[0] =
1998
55.9k
        ai4_satd_16x16[0][0] + ai4_satd_16x16[0][1] + ai4_satd_16x16[0][2] + ai4_satd_16x16[0][3];
1999
55.9k
    ai4_satd_32x32[1] =
2000
55.9k
        ai4_satd_16x16[1][0] + ai4_satd_16x16[1][1] + ai4_satd_16x16[1][2] + ai4_satd_16x16[1][3];
2001
55.9k
    ai4_satd_32x32[2] =
2002
55.9k
        ai4_satd_16x16[2][0] + ai4_satd_16x16[2][1] + ai4_satd_16x16[2][2] + ai4_satd_16x16[2][3];
2003
55.9k
    ai4_satd_32x32[3] =
2004
55.9k
        ai4_satd_16x16[3][0] + ai4_satd_16x16[3][1] + ai4_satd_16x16[3][2] + ai4_satd_16x16[3][3];
2005
2006
    /* Update 64x64 SATDs */
2007
55.9k
    pi4_sad_grid[PART_ID_2Nx2N] =
2008
55.9k
        ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + ai4_satd_32x32[3];
2009
2010
    /* Update 32x32 SATDs */
2011
55.9k
    pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_32x32[0];
2012
55.9k
    pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_32x32[1];
2013
55.9k
    pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_32x32[2];
2014
55.9k
    pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_32x32[3];
2015
2016
    /* Update 32x64 / 64x32 SATDs */
2017
55.9k
    pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_32x32[0] + ai4_satd_32x32[2];
2018
55.9k
    pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_32x32[1] + ai4_satd_32x32[3];
2019
55.9k
    pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_32x32[0] + ai4_satd_32x32[1];
2020
55.9k
    pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_32x32[2] + ai4_satd_32x32[3];
2021
2022
    /* Update AMP SATDs 64x48,64x16, 48x64,16x64  */
2023
55.9k
    pi4_sad_grid[PART_ID_nLx2N_L] =
2024
55.9k
        ai4_satd_16x16[0][0] + ai4_satd_16x16[0][2] + ai4_satd_16x16[2][0] + ai4_satd_16x16[2][2];
2025
2026
55.9k
    pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_16x16[0][1] + ai4_satd_16x16[0][3] +
2027
55.9k
                                    ai4_satd_16x16[2][1] + ai4_satd_16x16[2][3] +
2028
55.9k
                                    pi4_sad_grid[PART_ID_Nx2N_R];
2029
2030
55.9k
    pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_16x16[1][0] + ai4_satd_16x16[1][2] +
2031
55.9k
                                    ai4_satd_16x16[3][0] + ai4_satd_16x16[3][2] +
2032
55.9k
                                    pi4_sad_grid[PART_ID_Nx2N_L];
2033
2034
55.9k
    pi4_sad_grid[PART_ID_nRx2N_R] =
2035
55.9k
        ai4_satd_16x16[1][1] + ai4_satd_16x16[1][3] + ai4_satd_16x16[3][1] + ai4_satd_16x16[3][3];
2036
2037
55.9k
    pi4_sad_grid[PART_ID_2NxnU_T] =
2038
55.9k
        ai4_satd_16x16[0][0] + ai4_satd_16x16[0][1] + ai4_satd_16x16[1][0] + ai4_satd_16x16[1][1];
2039
2040
55.9k
    pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_16x16[0][2] + ai4_satd_16x16[0][3] +
2041
55.9k
                                    ai4_satd_16x16[1][2] + ai4_satd_16x16[1][3] +
2042
55.9k
                                    pi4_sad_grid[PART_ID_2NxN_B];
2043
2044
55.9k
    pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_16x16[2][0] + ai4_satd_16x16[2][1] +
2045
55.9k
                                    ai4_satd_16x16[3][0] + ai4_satd_16x16[3][1] +
2046
55.9k
                                    pi4_sad_grid[PART_ID_2NxN_T];
2047
2048
55.9k
    pi4_sad_grid[PART_ID_2NxnD_B] =
2049
55.9k
        ai4_satd_16x16[2][2] + ai4_satd_16x16[2][3] + ai4_satd_16x16[3][2] + ai4_satd_16x16[3][3];
2050
55.9k
}
2051
2052
WORD32 hme_evalsatd_pt_pu_64x64_tu_rec(
2053
    err_prms_t *ps_prms,
2054
    WORD32 lambda,
2055
    WORD32 lambda_q_shift,
2056
    WORD32 i4_frm_qstep,
2057
    me_func_selector_t *ps_func_selector)
2058
83.5k
{
2059
83.5k
    S32 ai4_satd_4x4[64]; /* num 4x4s in a 32x32 * num 32x32 in 64x64 */
2060
83.5k
    S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 * num 32x32 in 64x64 */
2061
83.5k
    S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32* num 32x32 in 64x64 */
2062
83.5k
    S32 ai4_satd_32x32[4]; /* num 32x32 in 64x64 */
2063
2064
83.5k
    S32 ai4_tu_split_8x8[16];
2065
83.5k
    S32 ai4_tu_split_16x16[4];
2066
2067
83.5k
    S32 ai4_tu_early_cbf_8x8[16];
2068
83.5k
    S32 ai4_tu_early_cbf_16x16[4];
2069
2070
83.5k
    S16 *pi2_had_out;
2071
83.5k
    S32 i;
2072
2073
    /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
2074
83.5k
    S32 *api4_satd_pu[HAD_32x32 + 1];
2075
83.5k
    S32 *api4_tu_split[HAD_32x32 + 1];
2076
83.5k
    S32 *api4_tu_early_cbf[HAD_32x32 + 1];
2077
2078
83.5k
    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
2079
2080
83.5k
    S32 tu_split_flag = 0;
2081
83.5k
    S32 total_satd_cost = 0;
2082
2083
83.5k
    U08 *pu1_inp = ps_prms->pu1_inp;
2084
83.5k
    U08 *pu1_ref = ps_prms->pu1_ref;
2085
2086
83.5k
    S32 inp_stride = ps_prms->i4_inp_stride;
2087
83.5k
    S32 ref_stride = ps_prms->i4_ref_stride;
2088
2089
    /* Initialize tu_split_cost to "0" */
2090
83.5k
    ps_prms->i4_tu_split_cost = 0;
2091
2092
83.5k
    pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;
2093
2094
417k
    for(i = 0; i < 4; i++)
2095
334k
    {
2096
334k
        S32 blkx = (i & 0x1);
2097
334k
        S32 blky = (i >> 1);
2098
334k
        U08 *pu1_pi0, *pu1_pi1;
2099
334k
        tu_split_flag = 0;
2100
2101
334k
        api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
2102
334k
        api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
2103
334k
        api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0];
2104
334k
        api4_satd_pu[HAD_32x32] = &ai4_satd_32x32[i];
2105
2106
334k
        api4_tu_split[HAD_4x4] = NULL;
2107
334k
        api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0];
2108
334k
        api4_tu_split[HAD_16x16] = &ai4_tu_split_16x16[0];
2109
334k
        api4_tu_split[HAD_32x32] = &ps_prms->pi4_tu_split_flags[i];
2110
2111
334k
        api4_tu_early_cbf[HAD_4x4] = NULL;
2112
334k
        api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0];
2113
334k
        api4_tu_early_cbf[HAD_16x16] = &ai4_tu_early_cbf_16x16[0];
2114
334k
        api4_tu_early_cbf[HAD_32x32] = &ps_prms->pi4_tu_early_cbf[i];
2115
2116
334k
        pu1_pi0 = pu1_inp + (blkx * 32) + (blky * 32 * inp_stride);
2117
334k
        pu1_pi1 = pu1_ref + (blkx * 32) + (blky * 32 * ref_stride);
2118
2119
        /* Call recursive 32x32 HAD module; updates satds for 4x4, 8x8, 16x16 and 32x32 */
2120
334k
        ihevce_had_32x32_r(
2121
334k
            pu1_pi0,
2122
334k
            inp_stride,
2123
334k
            pu1_pi1,
2124
334k
            ref_stride,
2125
334k
            pi2_had_out,
2126
334k
            32,
2127
334k
            api4_satd_pu,
2128
334k
            api4_tu_split,
2129
334k
            api4_tu_early_cbf,
2130
334k
            0,
2131
334k
            8,
2132
334k
            lambda,
2133
334k
            lambda_q_shift,
2134
334k
            i4_frm_qstep,
2135
334k
            1,
2136
334k
            ps_prms->u1_max_tr_depth,
2137
334k
            ps_prms->u1_max_tr_size,
2138
334k
            &(ps_prms->i4_tu_split_cost),
2139
334k
            ps_func_selector);
2140
334k
    }
2141
2142
83.5k
    total_satd_cost = ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + ai4_satd_32x32[3];
2143
2144
    /* Update 64x64 SATDs */
2145
83.5k
    pi4_sad_grid[PART_ID_2Nx2N] =
2146
83.5k
        ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + ai4_satd_32x32[3];
2147
2148
83.5k
    return total_satd_cost;
2149
83.5k
}
2150
2151
/**
2152
********************************************************************************
2153
*  @fn     void hme_subpel_refine_search_node(search_node_t *ps_search_node,
2154
*                                   hme_subpel_prms_t *ps_prms,
2155
*                                   layer_ctxt_t *ps_curr_layer,
2156
*                                   BLK_SIZE_T e_blk_size,
2157
*                                   S32 x_off,
2158
*                                   S32 y_off)
2159
*
2160
*  @brief  Refines a given partition within a CU
2161
*
2162
*  @param[in,out]  ps_search_node: supplies starting mv and also ref id.
2163
*                   updated with the accurate subpel mv
2164
*
2165
*  @param[in]  ps_prms: subpel prms input to this function
2166
*
2167
*  @param[in]  ps_curr_layer : layer context
2168
*
2169
*  @param[in]  e_blk_size : Block size enumeration
2170
*
2171
*  @param[in]  x_off : x offset of the partition w.r.t. pic start
2172
*
2173
*  @param[in]  y_off : y offset of the partition w.r.t. pic start
2174
*
2175
*  @return None
2176
********************************************************************************
2177
*/
2178
2179
static __inline PF_SAD_RESULT_FXN_T hme_get_calc_sad_and_result_subpel_fxn(
2180
    me_func_selector_t *ps_func_selector,
2181
    ihevce_me_optimised_function_list_t *ps_me_optimised_function_list,
2182
    S32 i4_part_mask,
2183
    U08 u1_use_satd,
2184
    U08 u1_num_parts,
2185
    U08 u1_num_results)
2186
3.67M
{
2187
3.67M
    PF_SAD_RESULT_FXN_T pf_err_compute;
2188
2189
3.67M
    ASSERT((1 == u1_num_results) || (2 == u1_num_results));
2190
2191
3.67M
    if(1 == u1_num_results)
2192
3.67M
    {
2193
3.67M
        if(u1_use_satd)
2194
2.17M
        {
2195
2.17M
            if(u1_num_parts == 1)
2196
536k
            {
2197
536k
                pf_err_compute =
2198
536k
                    ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_eq_1;
2199
536k
            }
2200
1.63M
            else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2201
57.2k
            {
2202
57.2k
                pf_err_compute =
2203
57.2k
                    ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_lt_9;
2204
57.2k
            }
2205
1.57M
            else
2206
1.57M
            {
2207
1.57M
                pf_err_compute =
2208
1.57M
                    ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_lt_17;
2209
1.57M
            }
2210
2.17M
        }
2211
1.49M
        else
2212
1.49M
        {
2213
1.49M
            if(u1_num_parts == 1)
2214
1.41M
            {
2215
1.41M
                pf_err_compute = ps_me_optimised_function_list
2216
1.41M
                                     ->pf_calc_sad_and_1_best_result_subpel_num_part_eq_1;
2217
1.41M
            }
2218
85.5k
            else if(((i4_part_mask & ENABLE_SQUARE_PARTS) != 0) && (u1_num_parts == 5))
2219
11.3k
            {
2220
11.3k
                pf_err_compute =
2221
11.3k
                    ps_me_optimised_function_list->pf_calc_sad_and_1_best_result_subpel_square_parts;
2222
11.3k
            }
2223
74.1k
            else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2224
56.1k
            {
2225
56.1k
                pf_err_compute = ps_me_optimised_function_list
2226
56.1k
                                     ->pf_calc_sad_and_1_best_result_subpel_num_part_lt_9;
2227
56.1k
            }
2228
18.0k
            else
2229
18.0k
            {
2230
18.0k
                pf_err_compute = ps_me_optimised_function_list
2231
18.0k
                                     ->pf_calc_sad_and_1_best_result_subpel_num_part_lt_17;
2232
18.0k
            }
2233
1.49M
        }
2234
3.67M
    }
2235
0
    else
2236
0
    {
2237
0
        if(u1_use_satd)
2238
0
        {
2239
0
            if(u1_num_parts == 1)
2240
0
            {
2241
0
                pf_err_compute =
2242
0
                    ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_eq_1;
2243
0
            }
2244
0
            else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2245
0
            {
2246
0
                pf_err_compute =
2247
0
                    ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_lt_9;
2248
0
            }
2249
0
            else
2250
0
            {
2251
0
                pf_err_compute =
2252
0
                    ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_lt_17;
2253
0
            }
2254
0
        }
2255
0
        else
2256
0
        {
2257
0
            if(u1_num_parts == 1)
2258
0
            {
2259
0
                pf_err_compute = ps_me_optimised_function_list
2260
0
                                     ->pf_calc_sad_and_2_best_results_subpel_num_part_eq_1;
2261
0
            }
2262
0
            else if(((i4_part_mask & ENABLE_SQUARE_PARTS) != 0) && (u1_num_parts == 5))
2263
0
            {
2264
0
                pf_err_compute = ps_me_optimised_function_list
2265
0
                                     ->pf_calc_sad_and_2_best_results_subpel_square_parts;
2266
0
            }
2267
0
            else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2268
0
            {
2269
0
                pf_err_compute = ps_me_optimised_function_list
2270
0
                                     ->pf_calc_sad_and_2_best_results_subpel_num_part_lt_9;
2271
0
            }
2272
0
            else
2273
0
            {
2274
0
                pf_err_compute = ps_me_optimised_function_list
2275
0
                                     ->pf_calc_sad_and_2_best_results_subpel_num_part_lt_17;
2276
0
            }
2277
0
        }
2278
0
    }
2279
2280
3.67M
    return pf_err_compute;
2281
3.67M
}
2282
2283
#if DIAMOND_GRID == 1
2284
S32 hme_subpel_refine_search_node_high_speed(
2285
    search_node_t *ps_search_node,
2286
    hme_subpel_prms_t *ps_prms,
2287
    layer_ctxt_t *ps_curr_layer,
2288
    BLK_SIZE_T e_blk_size,
2289
    S32 x_off,
2290
    S32 y_off,
2291
    search_results_t *ps_search_results,
2292
    S32 pred_lx,
2293
    S32 i4_part_mask,
2294
    S32 *pi4_valid_part_ids,
2295
    S32 search_idx,
2296
    subpel_dedup_enabler_t *ps_dedup_enabler,
2297
    me_func_selector_t *ps_func_selector,
2298
    ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
2299
3.67M
{
2300
3.67M
    S32 i4_num_hpel_refine, i4_num_qpel_refine;
2301
3.67M
    S32 i4_offset, i4_grid_mask;
2302
3.67M
    S08 i1_ref_idx;
2303
3.67M
    S32 i4_blk_wd, i4_blk_ht;
2304
3.67M
    S32 i4_ref_stride, i4_i;
2305
3.67M
    pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2306
3.67M
    result_upd_prms_t s_result_prms;
2307
3.67M
    search_node_t s_temp_search_node;
2308
2309
    /*************************************************************************/
2310
    /* Tracks current MV with the fractional component.                      */
2311
    /*************************************************************************/
2312
3.67M
    S32 i4_mv_x, i4_mv_y;
2313
3.67M
    S32 i4_frac_x, i4_frac_y;
2314
2315
    /*************************************************************************/
2316
    /* Function pointer for SAD/SATD, array and prms structure to pass to    */
2317
    /* This function                                                         */
2318
    /*************************************************************************/
2319
3.67M
    PF_SAD_RESULT_FXN_T pf_err_compute;
2320
2321
3.67M
    S32 ai4_sad_grid[17], i4_tot_cost;
2322
3.67M
    err_prms_t s_err_prms;
2323
2324
    /*************************************************************************/
2325
    /* Allowed MV RANGE                                                      */
2326
    /*************************************************************************/
2327
3.67M
    range_prms_t *ps_range_prms;
2328
2329
    /*************************************************************************/
2330
    /* stores min id in grid with associated min cost.                       */
2331
    /*************************************************************************/
2332
3.67M
    S32 i4_min_cost, i4_min_sad;
2333
3.67M
    GRID_PT_T e_min_id;
2334
2335
3.67M
    PF_INTERP_FXN_T pf_qpel_interp;
2336
    /*************************************************************************/
2337
    /* For hpel and qpel we move in diamonds and hence each point in the     */
2338
    /* diamond will belong to a completely different plane. To simplify the  */
2339
    /* look up of the ref ptr, we declare a 2x2 array of ref ptrs for the    */
2340
    /* hpel planes which are interpolated during recon.                      */
2341
    /*************************************************************************/
2342
3.67M
    U08 *apu1_hpel_ref[4], *pu1_ref;
2343
2344
3.67M
    interp_prms_t s_interp_prms;
2345
2346
    /*************************************************************************/
2347
    /* Maintains the minimum id of interpolated buffers, and the pointer that*/
2348
    /* points to the corresponding predicted buf with its stride.            */
2349
    /* Note that the pointer cannot be derived just from the id, since the   */
2350
    /* pointer may also point to the hpel buffer (in case we request interp  */
2351
    /* of a hpel pt, which already exists in the recon hpel planes)          */
2352
    /*************************************************************************/
2353
3.67M
    U08 *pu1_final_out;
2354
3.67M
    S32 i4_final_out_stride;
2355
3.67M
    S32 part_id;
2356
3.67M
    S32 check_for_duplicate = 0;
2357
2358
3.67M
    subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_prms->ps_subpel_refine_ctxt;
2359
2360
3.67M
    S32 mvx_qpel;
2361
3.67M
    S32 mvy_qpel;
2362
2363
3.67M
    pf_err_compute = hme_get_calc_sad_and_result_subpel_fxn(
2364
3.67M
        ps_func_selector,
2365
3.67M
        ps_me_optimised_function_list,
2366
3.67M
        i4_part_mask,
2367
3.67M
        ps_prms->i4_use_satd,
2368
3.67M
        ps_subpel_refine_ctxt->i4_num_valid_parts,
2369
3.67M
        ps_search_results->u1_num_results_per_part);
2370
2371
3.67M
    i4_num_hpel_refine = ps_prms->i4_num_steps_hpel_refine;
2372
3.67M
    i4_num_qpel_refine = ps_prms->i4_num_steps_qpel_refine;
2373
2374
    /* Prediction context should now deal with qpel units */
2375
3.67M
    HME_SET_MVPRED_RES(ps_pred_ctxt, MV_RES_QPEL);
2376
2377
    /* Buffer allocation for subpel */
2378
    /* Current design is that there may be many partitions and different mvs */
2379
    /* that attempt subpel refinement. While there is possibility of overlap, the */
2380
    /* hashing to detect and avoid overlap may be very complex. So, currently,   */
2381
    /* the only thing done is to store the eventual predicted buffer with every  */
2382
    /* ctb node that holds the result of the best subpel search */
2383
2384
    /* Compute the base pointer for input, interpolated buffers */
2385
    /* The base pointers point as follows: */
2386
    /* fx fy : 0, 0 :: fx, hy : 0, 0.5, hx, fy: 0.5, 0, hx, hy: 0.5, 0.5 */
2387
    /* To these, we need to add the offset of the current node */
2388
3.67M
    i4_ref_stride = ps_curr_layer->i4_rec_stride;
2389
3.67M
    i4_offset = x_off + (y_off * i4_ref_stride);
2390
3.67M
    i1_ref_idx = ps_search_node->i1_ref_idx;
2391
2392
3.67M
    apu1_hpel_ref[0] = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx] + i4_offset;
2393
3.67M
    apu1_hpel_ref[1] = ps_curr_layer->ppu1_list_rec_hxfy[i1_ref_idx] + i4_offset;
2394
3.67M
    apu1_hpel_ref[2] = ps_curr_layer->ppu1_list_rec_fxhy[i1_ref_idx] + i4_offset;
2395
3.67M
    apu1_hpel_ref[3] = ps_curr_layer->ppu1_list_rec_hxhy[i1_ref_idx] + i4_offset;
2396
2397
    /* Initialize result params used for partition update */
2398
3.67M
    s_result_prms.pf_mv_cost_compute = NULL;
2399
3.67M
    s_result_prms.ps_search_results = ps_search_results;
2400
3.67M
    s_result_prms.pi4_valid_part_ids = pi4_valid_part_ids;
2401
3.67M
    s_result_prms.i1_ref_idx = ps_search_node->i1_ref_idx;
2402
3.67M
    s_result_prms.u1_pred_lx = search_idx;
2403
3.67M
    s_result_prms.i4_part_mask = i4_part_mask;
2404
3.67M
    s_result_prms.ps_search_node_base = ps_search_node;
2405
3.67M
    s_result_prms.pi4_sad_grid = &ai4_sad_grid[0];
2406
3.67M
    s_result_prms.i4_grid_mask = 1;
2407
3.67M
    s_result_prms.ps_search_node = &s_temp_search_node;
2408
3.67M
    s_temp_search_node.i1_ref_idx = ps_search_node->i1_ref_idx;
2409
2410
    /* convert to hpel units */
2411
3.67M
    i4_mv_x = ps_search_node->s_mv.i2_mvx >> 1;
2412
3.67M
    i4_mv_y = ps_search_node->s_mv.i2_mvy >> 1;
2413
2414
    /* for first pt, we compute at all locations in the grid, 4 + 1 centre */
2415
3.67M
    ps_range_prms = ps_prms->aps_mv_range_qpel[i1_ref_idx];
2416
3.67M
    i4_grid_mask = (GRID_DIAMOND_ENABLE_ALL);
2417
3.67M
    i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
2418
2419
3.67M
    i4_min_cost = MAX_32BIT_VAL;
2420
3.67M
    i4_min_sad = MAX_32BIT_VAL;
2421
2422
    /*************************************************************************/
2423
    /* Prepare the input params to SAD/SATD function. Note that input is     */
2424
    /* passed from the calling function since it may be I (normal subpel     */
2425
    /* refinement) or 2I - P0 in case of bidirect subpel refinement.         */
2426
    /* Both cases are handled here.                                          */
2427
    /*************************************************************************/
2428
3.67M
    s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp;
2429
3.67M
    s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride;
2430
3.67M
    s_err_prms.i4_ref_stride = i4_ref_stride;
2431
3.67M
    s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
2432
3.67M
    s_err_prms.i4_grid_mask = 1;
2433
3.67M
    s_err_prms.pi4_sad_grid = &ai4_sad_grid[0];
2434
3.67M
    s_err_prms.i4_blk_wd = i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
2435
3.67M
    s_err_prms.i4_blk_ht = i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];
2436
2437
3.67M
    s_result_prms.ps_subpel_refine_ctxt = ps_subpel_refine_ctxt;
2438
2439
3.67M
    part_id = ps_search_node->u1_part_id;
2440
4.23M
    for(i4_i = 0; i4_i < i4_num_hpel_refine; i4_i++)
2441
4.04M
    {
2442
4.04M
        e_min_id = PT_C;
2443
2444
4.04M
        mvx_qpel = i4_mv_x << 1;
2445
4.04M
        mvy_qpel = i4_mv_y << 1;
2446
2447
        /* Central pt */
2448
4.04M
        if(i4_grid_mask & BIT_EN(PT_C))
2449
3.67M
        {
2450
            //ps_search_node->i2_mv_x = (S16)i4_mv_x;
2451
            //ps_search_node->i2_mv_x = (S16)i4_mv_y;
2452
            /* central pt is i4_mv_x, i4_mv_y */
2453
3.67M
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2454
3.67M
                ps_dedup_enabler, 1, mvx_qpel, mvy_qpel, check_for_duplicate);
2455
2456
3.67M
            i4_frac_x = i4_mv_x & 1;
2457
3.67M
            i4_frac_y = i4_mv_y & 1;
2458
3.67M
            pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2459
3.67M
            s_err_prms.pu1_ref = pu1_ref + (i4_mv_x >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
2460
2461
            /* Update the mv's with the current candt motion vectors */
2462
3.67M
            s_result_prms.i2_mv_x = mvx_qpel;
2463
3.67M
            s_result_prms.i2_mv_y = mvy_qpel;
2464
3.67M
            s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2465
3.67M
            s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2466
2467
3.67M
            pf_err_compute(&s_err_prms, &s_result_prms);
2468
2469
3.67M
            i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2470
3.67M
            if(i4_tot_cost < i4_min_cost)
2471
3.67M
            {
2472
3.67M
                i4_min_cost = i4_tot_cost;
2473
3.67M
                i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2474
3.67M
                e_min_id = PT_C;
2475
3.67M
                pu1_final_out = s_err_prms.pu1_ref;
2476
3.67M
            }
2477
3.67M
        }
2478
2479
        /* left pt */
2480
4.04M
        if(i4_grid_mask & BIT_EN(PT_L))
2481
3.94M
        {
2482
3.94M
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2483
3.94M
                ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel, check_for_duplicate);
2484
2485
3.94M
            if(!check_for_duplicate)
2486
3.90M
            {
2487
                /* search node mv is stored in qpel units */
2488
3.90M
                ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x - 1) << 1);
2489
3.90M
                ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
2490
                /* central pt is i4_mv_x - 1, i4_mv_y */
2491
3.90M
                i4_frac_x = (i4_mv_x - 1) & 1;  // same as (x-1)&1
2492
3.90M
                i4_frac_y = i4_mv_y & 1;
2493
3.90M
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2494
3.90M
                s_err_prms.pu1_ref =
2495
3.90M
                    pu1_ref + ((i4_mv_x - 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
2496
2497
                /* Update the mv's with the current candt motion vectors */
2498
3.90M
                s_result_prms.i2_mv_x = mvx_qpel - 2;
2499
3.90M
                s_result_prms.i2_mv_y = mvy_qpel;
2500
3.90M
                s_temp_search_node.s_mv.i2_mvx = mvx_qpel - 2;
2501
3.90M
                s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2502
2503
3.90M
                pf_err_compute(&s_err_prms, &s_result_prms);
2504
                //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2505
3.90M
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2506
3.90M
                if(i4_tot_cost < i4_min_cost)
2507
199k
                {
2508
199k
                    i4_min_cost = i4_tot_cost;
2509
199k
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2510
199k
                    e_min_id = PT_L;
2511
199k
                    pu1_final_out = s_err_prms.pu1_ref;
2512
199k
                }
2513
3.90M
            }
2514
3.94M
        }
2515
        /* top pt */
2516
4.04M
        if(i4_grid_mask & BIT_EN(PT_T))
2517
3.94M
        {
2518
3.94M
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2519
3.94M
                ps_dedup_enabler, 1, mvx_qpel, mvy_qpel - 2, check_for_duplicate);
2520
2521
3.94M
            if(!check_for_duplicate)
2522
3.90M
            {
2523
                /* search node mv is stored in qpel units */
2524
3.90M
                ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1);
2525
3.90M
                ps_search_node->s_mv.i2_mvy = (S16)((i4_mv_y - 1) << 1);
2526
                /* top pt is i4_mv_x, i4_mv_y - 1 */
2527
3.90M
                i4_frac_x = i4_mv_x & 1;
2528
3.90M
                i4_frac_y = (i4_mv_y - 1) & 1;
2529
3.90M
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2530
3.90M
                s_err_prms.pu1_ref =
2531
3.90M
                    pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y - 1) >> 1) * i4_ref_stride);
2532
2533
                /* Update the mv's with the current candt motion vectors */
2534
3.90M
                s_result_prms.i2_mv_x = mvx_qpel;
2535
3.90M
                s_result_prms.i2_mv_y = mvy_qpel - 2;
2536
3.90M
                s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2537
3.90M
                s_temp_search_node.s_mv.i2_mvy = mvy_qpel - 2;
2538
2539
3.90M
                pf_err_compute(&s_err_prms, &s_result_prms);
2540
                //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2541
3.90M
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2542
3.90M
                if(i4_tot_cost < i4_min_cost)
2543
158k
                {
2544
158k
                    i4_min_cost = i4_tot_cost;
2545
158k
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2546
158k
                    e_min_id = PT_T;
2547
158k
                    pu1_final_out = s_err_prms.pu1_ref;
2548
158k
                }
2549
3.90M
            }
2550
3.94M
        }
2551
        /* right pt */
2552
4.04M
        if(i4_grid_mask & BIT_EN(PT_R))
2553
3.95M
        {
2554
3.95M
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2555
3.95M
                ps_dedup_enabler, num_unique_nodes, mvx_qpel + 2, mvy_qpel, check_for_duplicate);
2556
3.95M
            if(!check_for_duplicate)
2557
3.91M
            {
2558
                /* search node mv is stored in qpel units */
2559
3.91M
                ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x + 1) << 1);
2560
3.91M
                ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
2561
                /* right pt is i4_mv_x + 1, i4_mv_y */
2562
3.91M
                i4_frac_x = (i4_mv_x + 1) & 1;
2563
3.91M
                i4_frac_y = i4_mv_y & 1;
2564
2565
3.91M
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2566
3.91M
                s_err_prms.pu1_ref =
2567
3.91M
                    pu1_ref + ((i4_mv_x + 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
2568
2569
                /* Update the mv's with the current candt motion vectors */
2570
3.91M
                s_result_prms.i2_mv_x = mvx_qpel + 2;
2571
3.91M
                s_result_prms.i2_mv_y = mvy_qpel;
2572
3.91M
                s_temp_search_node.s_mv.i2_mvx = mvx_qpel + 2;
2573
3.91M
                s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2574
2575
3.91M
                pf_err_compute(&s_err_prms, &s_result_prms);
2576
                //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2577
3.91M
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2578
3.91M
                if(i4_tot_cost < i4_min_cost)
2579
204k
                {
2580
204k
                    i4_min_cost = i4_tot_cost;
2581
204k
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2582
204k
                    e_min_id = PT_R;
2583
204k
                    pu1_final_out = s_err_prms.pu1_ref;
2584
204k
                }
2585
3.91M
            }
2586
3.95M
        }
2587
        /* bottom pt */
2588
4.04M
        if(i4_grid_mask & BIT_EN(PT_B))
2589
3.96M
        {
2590
3.96M
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2591
3.96M
                ps_dedup_enabler, num_unique_nodes, mvx_qpel, mvy_qpel + 2, check_for_duplicate);
2592
3.96M
            if(!check_for_duplicate)
2593
3.92M
            {
2594
                /* search node mv is stored in qpel units */
2595
3.92M
                ps_search_node->s_mv.i2_mvx = ((S16)i4_mv_x << 1);
2596
3.92M
                ps_search_node->s_mv.i2_mvy = ((S16)(i4_mv_y + 1) << 1);
2597
3.92M
                i4_frac_x = i4_mv_x & 1;
2598
3.92M
                i4_frac_y = (i4_mv_y + 1) & 1;
2599
3.92M
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2600
3.92M
                s_err_prms.pu1_ref =
2601
3.92M
                    pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y + 1) >> 1) * i4_ref_stride);
2602
2603
                /* Update the mv's with the current candt motion vectors */
2604
3.92M
                s_result_prms.i2_mv_x = mvx_qpel;
2605
3.92M
                s_result_prms.i2_mv_y = mvy_qpel + 2;
2606
3.92M
                s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2607
3.92M
                s_temp_search_node.s_mv.i2_mvy = mvy_qpel + 2;
2608
2609
3.92M
                pf_err_compute(&s_err_prms, &s_result_prms);
2610
                //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2611
3.92M
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2612
3.92M
                if(i4_tot_cost < i4_min_cost)
2613
160k
                {
2614
160k
                    i4_min_cost = i4_tot_cost;
2615
160k
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2616
160k
                    e_min_id = PT_B;
2617
160k
                    pu1_final_out = s_err_prms.pu1_ref;
2618
160k
                }
2619
3.92M
            }
2620
3.96M
        }
2621
        /* Early exit in case of central point */
2622
4.04M
        if(e_min_id == PT_C)
2623
3.48M
            break;
2624
2625
        /*********************************************************************/
2626
        /* Depending on the best result location, we may be able to skip     */
2627
        /* atleast two pts, centre pt and one more pt. E.g. if right pt is   */
2628
        /* the best result, the next iteration need not do centre, left pts  */
2629
        /*********************************************************************/
2630
566k
        i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
2631
566k
        i4_mv_x += gai1_grid_id_to_x[e_min_id];
2632
566k
        i4_mv_y += gai1_grid_id_to_y[e_min_id];
2633
566k
        ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2634
566k
        ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2635
566k
        i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
2636
566k
    }
2637
2638
    /* Convert to QPEL units */
2639
3.67M
    i4_mv_x <<= 1;
2640
3.67M
    i4_mv_y <<= 1;
2641
2642
3.67M
    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2643
3.67M
    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2644
2645
    /* Exact interpolation or averaging chosen here */
2646
3.67M
    pf_qpel_interp = ps_prms->pf_qpel_interp;
2647
2648
    /* Next QPEL ME */
2649
    /* In this case, we have option of doing exact QPEL interpolation or avg */
2650
    /*************************************************************************/
2651
    /*        x                                                              */
2652
    /*    A b C d                                                            */
2653
    /*    e f g h                                                            */
2654
    /*    I j K l                                                            */
2655
    /*    m n o p                                                            */
2656
    /*    Q r S t                                                            */
2657
    /*                                                                       */
2658
    /*    Approximate QPEL logic                                             */
2659
    /*    b = avg(A,C) f = avg(I,C), g= avg(C,K) j=avg(I,K)                  */
2660
    /*    for any given pt, we can get all the information required about    */
2661
    /*    the surrounding 4 pts. For example, given point C (0.5, 0)         */
2662
    /*     surrounding pts info:                                             */
2663
    /*     b : qpel offset: 1, 0, generated by averaging. buffer1: fpel buf  */
2664
    /*           buffer 2: hxfy, offsets for both are 0, 0                   */
2665
    /*    similarly for other pts the info can be gotten                     */
2666
    /*************************************************************************/
2667
3.67M
    i4_grid_mask = GRID_DIAMOND_ENABLE_ALL ^ (BIT_EN(PT_C));
2668
3.67M
    i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
2669
2670
    /*************************************************************************/
2671
    /* One time preparation of non changing interpolation params. These      */
2672
    /* include a set of ping pong result buf ptrs, input buf ptrs and some   */
2673
    /* working memory (not used though in case of averaging).                */
2674
    /*************************************************************************/
2675
3.67M
    s_interp_prms.ppu1_ref = &apu1_hpel_ref[0];
2676
3.67M
    s_interp_prms.i4_ref_stride = i4_ref_stride;
2677
3.67M
    s_interp_prms.i4_blk_wd = i4_blk_wd;
2678
3.67M
    s_interp_prms.i4_blk_ht = i4_blk_ht;
2679
2680
3.67M
    i4_final_out_stride = i4_ref_stride;
2681
2682
3.67M
    {
2683
3.67M
        U08 *pu1_mem;
2684
        /*********************************************************************/
2685
        /* Allocation of working memory for interpolated buffers. We maintain*/
2686
        /* an intermediate working buffer, and 2 ping pong interpolated out  */
2687
        /* buffers, purpose of ping pong explained later below               */
2688
        /*********************************************************************/
2689
3.67M
        pu1_mem = ps_prms->pu1_wkg_mem;
2690
3.67M
        s_interp_prms.pu1_wkg_mem = pu1_mem;
2691
2692
        //pu1_mem += (INTERP_INTERMED_BUF_SIZE);
2693
3.67M
        s_interp_prms.apu1_interp_out[0] = pu1_mem;
2694
2695
3.67M
        pu1_mem += (INTERP_OUT_BUF_SIZE);
2696
3.67M
        s_interp_prms.apu1_interp_out[1] = pu1_mem;
2697
2698
3.67M
        pu1_mem += (INTERP_OUT_BUF_SIZE);
2699
3.67M
        s_interp_prms.apu1_interp_out[2] = pu1_mem;
2700
2701
3.67M
        pu1_mem += (INTERP_OUT_BUF_SIZE);
2702
3.67M
        s_interp_prms.apu1_interp_out[3] = pu1_mem;
2703
2704
3.67M
        pu1_mem += (INTERP_OUT_BUF_SIZE);
2705
3.67M
        s_interp_prms.apu1_interp_out[4] = pu1_mem;
2706
2707
        /*********************************************************************/
2708
        /* Stride of interpolated output is just a function of blk width of  */
2709
        /* this partition and hence remains constant for this partition      */
2710
        /*********************************************************************/
2711
3.67M
        s_interp_prms.i4_out_stride = (i4_blk_wd);
2712
3.67M
    }
2713
2714
3.67M
    {
2715
3.67M
        UWORD8 *apu1_final[4];
2716
3.67M
        WORD32 ai4_ref_stride[4];
2717
        /*************************************************************************/
2718
        /* Ping pong design for interpolated buffers. We use a min id, which     */
2719
        /* tracks the id of the ppu1_interp_out that stores the best result.     */
2720
        /* When new interp to be done, it uses 1 - best result id to do the interp */
2721
        /* min id is toggled when any new result becomes the best result.        */
2722
        /*************************************************************************/
2723
2724
4.04M
        for(i4_i = 0; i4_i < i4_num_qpel_refine; i4_i++)
2725
3.77M
        {
2726
3.77M
            e_min_id = PT_C;
2727
2728
3.77M
            mvx_qpel = i4_mv_x;
2729
3.77M
            mvy_qpel = i4_mv_y;
2730
3.77M
            hme_qpel_interp_comprehensive(
2731
3.77M
                &s_interp_prms,
2732
3.77M
                apu1_final,
2733
3.77M
                ai4_ref_stride,
2734
3.77M
                i4_mv_x,
2735
3.77M
                i4_mv_y,
2736
3.77M
                i4_grid_mask,
2737
3.77M
                ps_me_optimised_function_list);
2738
3.77M
            if(i4_grid_mask & BIT_EN(PT_L))
2739
3.70M
            {
2740
3.70M
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2741
3.70M
                    ps_dedup_enabler,
2742
3.70M
                    num_unique_nodes,
2743
3.70M
                    mvx_qpel - 1,
2744
3.70M
                    mvy_qpel - 0,
2745
3.70M
                    check_for_duplicate);
2746
2747
3.70M
                if(!check_for_duplicate)
2748
3.65M
                {
2749
3.65M
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
2750
3.65M
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2751
2752
3.65M
                    s_err_prms.pu1_ref = apu1_final[0];
2753
3.65M
                    s_err_prms.i4_ref_stride = ai4_ref_stride[0];
2754
2755
                    /* Update the mv's with the current candt motion vectors */
2756
3.65M
                    s_result_prms.i2_mv_x = mvx_qpel - 1;
2757
3.65M
                    s_result_prms.i2_mv_y = mvy_qpel;
2758
3.65M
                    s_temp_search_node.s_mv.i2_mvx = mvx_qpel - 1;
2759
3.65M
                    s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2760
2761
3.65M
                    pf_err_compute(&s_err_prms, &s_result_prms);
2762
                    //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2763
2764
3.65M
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2765
3.65M
                    if(i4_tot_cost < i4_min_cost)
2766
140k
                    {
2767
140k
                        e_min_id = PT_L;
2768
140k
                        i4_min_cost = i4_tot_cost;
2769
140k
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2770
140k
                    }
2771
3.65M
                }
2772
3.70M
            }
2773
3.77M
            if(i4_grid_mask & BIT_EN(PT_T))
2774
3.70M
            {
2775
3.70M
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2776
3.70M
                    ps_dedup_enabler,
2777
3.70M
                    num_unique_nodes,
2778
3.70M
                    mvx_qpel - 0,
2779
3.70M
                    mvy_qpel - 1,
2780
3.70M
                    check_for_duplicate);
2781
2782
3.70M
                if(!check_for_duplicate)
2783
3.64M
                {
2784
3.64M
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2785
3.64M
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
2786
2787
3.64M
                    s_err_prms.pu1_ref = apu1_final[1];
2788
3.64M
                    s_err_prms.i4_ref_stride = ai4_ref_stride[1];
2789
2790
                    /* Update the mv's with the current candt motion vectors */
2791
3.64M
                    s_result_prms.i2_mv_x = mvx_qpel;
2792
3.64M
                    s_result_prms.i2_mv_y = mvy_qpel - 1;
2793
2794
3.64M
                    s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2795
3.64M
                    s_temp_search_node.s_mv.i2_mvy = mvy_qpel - 1;
2796
2797
3.64M
                    pf_err_compute(&s_err_prms, &s_result_prms);
2798
2799
                    //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2800
3.64M
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2801
3.64M
                    if(i4_tot_cost < i4_min_cost)
2802
112k
                    {
2803
112k
                        e_min_id = PT_T;
2804
112k
                        i4_min_cost = i4_tot_cost;
2805
112k
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2806
112k
                    }
2807
3.64M
                }
2808
3.70M
            }
2809
3.77M
            if(i4_grid_mask & BIT_EN(PT_R))
2810
3.70M
            {
2811
3.70M
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2812
3.70M
                    ps_dedup_enabler, num_unique_nodes, mvx_qpel + 1, mvy_qpel, check_for_duplicate);
2813
2814
3.70M
                if(!check_for_duplicate)
2815
3.65M
                {
2816
3.65M
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
2817
3.65M
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2818
2819
3.65M
                    s_err_prms.pu1_ref = apu1_final[2];
2820
3.65M
                    s_err_prms.i4_ref_stride = ai4_ref_stride[2];
2821
2822
                    /* Update the mv's with the current candt motion vectors */
2823
3.65M
                    s_result_prms.i2_mv_x = mvx_qpel + 1;
2824
3.65M
                    s_result_prms.i2_mv_y = mvy_qpel;
2825
2826
3.65M
                    s_temp_search_node.s_mv.i2_mvx = mvx_qpel + 1;
2827
3.65M
                    s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2828
2829
3.65M
                    pf_err_compute(&s_err_prms, &s_result_prms);
2830
2831
                    //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2832
2833
3.65M
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2834
3.65M
                    if(i4_tot_cost < i4_min_cost)
2835
117k
                    {
2836
117k
                        e_min_id = PT_R;
2837
117k
                        i4_min_cost = i4_tot_cost;
2838
117k
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2839
117k
                    }
2840
3.65M
                }
2841
3.70M
            }
2842
            /* i4_mv_x and i4_mv_y will always be the centre pt */
2843
            /* for qpel we  start with least hpel, and hence compute of center pt never reqd */
2844
3.77M
            if(i4_grid_mask & BIT_EN(PT_B))
2845
3.70M
            {
2846
3.70M
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2847
3.70M
                    ps_dedup_enabler, num_unique_nodes, mvx_qpel, mvy_qpel + 1, check_for_duplicate);
2848
2849
3.70M
                if(!check_for_duplicate)
2850
3.65M
                {
2851
3.65M
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2852
3.65M
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
2853
2854
3.65M
                    s_err_prms.pu1_ref = apu1_final[3];
2855
3.65M
                    s_err_prms.i4_ref_stride = ai4_ref_stride[3];
2856
2857
                    /* Update the mv's with the current candt motion vectors */
2858
3.65M
                    s_result_prms.i2_mv_x = mvx_qpel;
2859
3.65M
                    s_result_prms.i2_mv_y = mvy_qpel + 1;
2860
2861
3.65M
                    s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2862
3.65M
                    s_temp_search_node.s_mv.i2_mvy = mvy_qpel + 1;
2863
2864
3.65M
                    pf_err_compute(&s_err_prms, &s_result_prms);
2865
2866
                    //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2867
3.65M
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2868
3.65M
                    if(i4_tot_cost < i4_min_cost)
2869
96.8k
                    {
2870
96.8k
                        e_min_id = PT_B;
2871
96.8k
                        i4_min_cost = i4_tot_cost;
2872
96.8k
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2873
96.8k
                    }
2874
3.65M
                }
2875
3.70M
            }
2876
2877
            /* New QPEL mv x and y */
2878
3.77M
            if(e_min_id == PT_C)
2879
3.40M
                break;
2880
374k
            i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
2881
374k
            i4_mv_x += gai1_grid_id_to_x[e_min_id];
2882
374k
            i4_mv_y += gai1_grid_id_to_y[e_min_id];
2883
374k
            ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2884
374k
            ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2885
374k
            i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
2886
374k
        }
2887
3.67M
    }
2888
2889
    /* update modified motion vectors and cost at end of subpel */
2890
3.67M
    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2891
3.67M
    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2892
3.67M
    ps_search_node->i4_tot_cost = i4_min_cost;
2893
3.67M
    ps_search_node->i4_sad = i4_min_sad;
2894
2895
    /********************************************************************************/
2896
    /* TODO: Restoring back Sad lambda from Hadamard lambda                         */
2897
    /* Need to pass the had/satd lambda in more cleaner way for subpel cost compute */
2898
    /********************************************************************************/
2899
    //ps_pred_ctxt->lambda >>= 1;
2900
2901
3.67M
    return (i4_min_cost);
2902
3.67M
}
2903
#elif DIAMOND_GRID == 0
2904
S32 hme_subpel_refine_search_node_high_speed(
2905
    search_node_t *ps_search_node,
2906
    hme_subpel_prms_t *ps_prms,
2907
    layer_ctxt_t *ps_curr_layer,
2908
    BLK_SIZE_T e_blk_size,
2909
    S32 x_off,
2910
    S32 y_off,
2911
    search_results_t *ps_search_results,
2912
    S32 pred_lx,
2913
    S32 i4_part_mask,
2914
    S32 *pi4_valid_part_ids,
2915
    S32 search_idx,
2916
    subpel_dedup_enabler_t *ps_dedup_enabler,
2917
    me_func_selector_t *ps_func_selector)
2918
{
2919
    S32 i4_num_hpel_refine, i4_num_qpel_refine;
2920
    S32 i4_offset, i4_grid_mask;
2921
    S08 i1_ref_idx;
2922
    S32 i4_blk_wd, i4_blk_ht;
2923
    S32 i4_ref_stride, i4_i;
2924
    pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2925
    result_upd_prms_t s_result_prms;
2926
2927
    /*************************************************************************/
2928
    /* Tracks current MV with the fractional component.                      */
2929
    /*************************************************************************/
2930
    S32 i4_mv_x, i4_mv_y;
2931
    S32 i4_frac_x, i4_frac_y;
2932
2933
    /*************************************************************************/
2934
    /* Function pointer for SAD/SATD, array and prms structure to pass to    */
2935
    /* This function                                                         */
2936
    /*************************************************************************/
2937
    PF_SAD_FXN_T pf_err_compute;
2938
    S32 ai4_sad_grid[9][17], i4_tot_cost;
2939
    err_prms_t s_err_prms;
2940
2941
    /*************************************************************************/
2942
    /* Allowed MV RANGE                                                      */
2943
    /*************************************************************************/
2944
    range_prms_t *ps_range_prms;
2945
2946
    /*************************************************************************/
2947
    /* stores min id in grid with associated min cost.                       */
2948
    /*************************************************************************/
2949
    S32 i4_min_cost, i4_min_sad;
2950
    GRID_PT_T e_min_id;
2951
2952
    PF_INTERP_FXN_T pf_qpel_interp;
2953
    /*************************************************************************/
2954
    /* For hpel and qpel we move in diamonds and hence each point in the     */
2955
    /* diamond will belong to a completely different plane. To simplify the  */
2956
    /* look up of the ref ptr, we declare a 2x2 array of ref ptrs for the    */
2957
    /* hpel planes which are interpolated during recon.                      */
2958
    /*************************************************************************/
2959
    U08 *apu1_hpel_ref[4], *pu1_ref;
2960
2961
    interp_prms_t s_interp_prms;
2962
2963
    /*************************************************************************/
2964
    /* Maintains the minimum id of interpolated buffers, and the pointer that*/
2965
    /* points to the corresponding predicted buf with its stride.            */
2966
    /* Note that the pointer cannot be derived just from the id, since the   */
2967
    /* pointer may also point to the hpel buffer (in case we request interp  */
2968
    /* of a hpel pt, which already exists in the recon hpel planes)          */
2969
    /*************************************************************************/
2970
    U08 *pu1_final_out;
2971
    S32 i4_final_out_stride;
2972
    S32 part_id;
2973
    S32 check_for_duplicate = 0;
2974
2975
    S32 mvx_qpel;
2976
    S32 mvy_qpel;
2977
2978
    /*************************************************************************/
2979
    /* Appropriate Err compute fxn, depends on SAD/SATD, blk size and remains*/
2980
    /* fixed through this subpel refinement for this partition.              */
2981
    /* Note, we do not enable grid sads since each pt is different buffers.  */
2982
    /* Hence, part mask is also nearly dont care and we use 2Nx2N enabled.   */
2983
    /*************************************************************************/
2984
    if(ps_prms->i4_use_satd)
2985
    {
2986
        pf_err_compute = hme_evalsatd_update_1_best_result_pt_pu_16x16;
2987
    }
2988
    else
2989
    {
2990
        pf_err_compute = hme_evalsad_grid_pu_16x16; /* hme_evalsad_pt_pu_16x16; */
2991
    }
2992
2993
    i4_num_hpel_refine = ps_prms->i4_num_steps_hpel_refine;
2994
    i4_num_qpel_refine = ps_prms->i4_num_steps_qpel_refine;
2995
2996
    /* Prediction context should now deal with qpel units */
2997
    HME_SET_MVPRED_RES(ps_pred_ctxt, MV_RES_QPEL);
2998
2999
    /* Buffer allocation for subpel */
3000
    /* Current design is that there may be many partitions and different mvs */
3001
    /* that attempt subpel refinement. While there is possibility of overlap, the */
3002
    /* hashing to detect and avoid overlap may be very complex. So, currently,   */
3003
    /* the only thing done is to store the eventual predicted buffer with every  */
3004
    /* ctb node that holds the result of the best subpel search */
3005
3006
    /* Compute the base pointer for input, interpolated buffers */
3007
    /* The base pointers point as follows: */
3008
    /* fx, fy: 0, 0 :: fx, hy: 0, 0.5 :: hx, fy: 0.5, 0 :: hx, hy: 0.5, 0.5 */
3009
    /* To these, we need to add the offset of the current node */
3010
    i4_ref_stride = ps_curr_layer->i4_rec_stride;
3011
    i4_offset = x_off + (y_off * i4_ref_stride);
3012
    i1_ref_idx = ps_search_node->i1_ref_idx;
3013
3014
    apu1_hpel_ref[0] = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx] + i4_offset;
3015
    apu1_hpel_ref[1] = ps_curr_layer->ppu1_list_rec_hxfy[i1_ref_idx] + i4_offset;
3016
    apu1_hpel_ref[2] = ps_curr_layer->ppu1_list_rec_fxhy[i1_ref_idx] + i4_offset;
3017
    apu1_hpel_ref[3] = ps_curr_layer->ppu1_list_rec_hxhy[i1_ref_idx] + i4_offset;
3018
3019
    /* Initialize result params used for partition update */
3020
    s_result_prms.pf_mv_cost_compute = NULL;
3021
    s_result_prms.ps_search_results = ps_search_results;
3022
    s_result_prms.pi4_valid_part_ids = pi4_valid_part_ids;
3023
    s_result_prms.i1_ref_idx = search_idx;
3024
    s_result_prms.i4_part_mask = i4_part_mask;
3025
    s_result_prms.ps_search_node_base = ps_search_node;
3026
    s_result_prms.pi4_sad_grid = &ai4_sad_grid[0][0];
3027
    s_result_prms.i4_grid_mask = 1;
3028
3029
    /* convert to hpel units */
3030
    i4_mv_x = ps_search_node->s_mv.i2_mvx >> 1;
3031
    i4_mv_y = ps_search_node->s_mv.i2_mvy >> 1;
3032
3033
    /* for first pt, we compute at all locations in the grid, 4 + 1 centre */
3034
    ps_range_prms = ps_prms->ps_mv_range_qpel;
3035
    i4_grid_mask = (GRID_ALL_PTS_VALID);
3036
    i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
3037
3038
    i4_min_cost = MAX_32BIT_VAL;
3039
    i4_min_sad = MAX_32BIT_VAL;
3040
3041
    /*************************************************************************/
3042
    /* Prepare the input params to SAD/SATD function. Note that input is     */
3043
    /* passed from the calling function since it may be I (normal subpel     */
3044
    /* refinement) or 2I - P0 in case of bidirect subpel refinement.         */
3045
    /* Both cases are handled here.                                          */
3046
    /*************************************************************************/
3047
    s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp;
3048
    s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride;
3049
    s_err_prms.i4_ref_stride = i4_ref_stride;
3050
    s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
3051
    s_err_prms.i4_grid_mask = 1;
3052
    s_err_prms.pi4_sad_grid = &ai4_sad_grid[0][0];
3053
    s_err_prms.i4_blk_wd = i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
3054
    s_err_prms.i4_blk_ht = i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];
3055
3056
    /* TODO: Currently doubling lambda for Hadamard Sad instead of 1.9*sadlambda */
3057
    //ps_pred_ctxt->lambda <<= 1;
3058
    part_id = ps_search_node->u1_part_id;
3059
    for(i4_i = 0; i4_i < i4_num_hpel_refine; i4_i++)
3060
    {
3061
        e_min_id = PT_C;
3062
3063
        mvx_qpel = i4_mv_x << 1;
3064
        mvy_qpel = i4_mv_y << 1;
3065
3066
        /* Central pt */
3067
        if(i4_grid_mask & BIT_EN(PT_C))
3068
        {
3069
            //ps_search_node->i2_mv_x = (S16)i4_mv_x;
3070
            //ps_search_node->i2_mv_x = (S16)i4_mv_y;
3071
            /* central pt is i4_mv_x, i4_mv_y */
3072
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3073
                ps_dedup_enabler, 1, mvx_qpel, mvy_qpel, check_for_duplicate);
3074
3075
            i4_frac_x = i4_mv_x & 1;
3076
            i4_frac_y = i4_mv_y & 1;
3077
            pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3078
            s_err_prms.pu1_ref = pu1_ref + (i4_mv_x >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
3079
            pf_err_compute(&s_err_prms);
3080
            /* Update the mv's with the current candt motion vectors */
3081
            s_result_prms.i2_mv_x = mvx_qpel;
3082
            s_result_prms.i2_mv_y = mvy_qpel;
3083
            hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3084
            i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3085
            if(i4_tot_cost < i4_min_cost)
3086
            {
3087
                i4_min_cost = i4_tot_cost;
3088
                i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3089
                e_min_id = PT_C;
3090
                pu1_final_out = s_err_prms.pu1_ref;
3091
            }
3092
        }
3093
3094
        /* left pt */
3095
        if(i4_grid_mask & BIT_EN(PT_L))
3096
        {
3097
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3098
                ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel, check_for_duplicate);
3099
3100
            if(!check_for_duplicate)
3101
            {
3102
                /* search node mv is stored in qpel units */
3103
                ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x - 1) << 1);
3104
                ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
3105
                /* central pt is i4_mv_x - 1, i4_mv_y */
3106
                i4_frac_x = (i4_mv_x - 1) & 1;  // same as (x-1)&1
3107
                i4_frac_y = i4_mv_y & 1;
3108
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3109
                s_err_prms.pu1_ref =
3110
                    pu1_ref + ((i4_mv_x - 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
3111
3112
                pf_err_compute(&s_err_prms);
3113
                /* Update the mv's with the current candt motion vectors */
3114
                s_result_prms.i2_mv_x = mvx_qpel;
3115
                s_result_prms.i2_mv_y = mvy_qpel;
3116
                hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3117
3118
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3119
3120
                if(i4_tot_cost < i4_min_cost)
3121
                {
3122
                    i4_min_cost = i4_tot_cost;
3123
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3124
                    e_min_id = PT_L;
3125
                    pu1_final_out = s_err_prms.pu1_ref;
3126
                }
3127
            }
3128
        }
3129
        /* top pt */
3130
        if(i4_grid_mask & BIT_EN(PT_T))
3131
        {
3132
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3133
                ps_dedup_enabler, 1, mvx_qpel, mvy_qpel - 2, check_for_duplicate);
3134
3135
            if(!check_for_duplicate)
3136
            {
3137
                /* search node mv is stored in qpel units */
3138
                ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1);
3139
                ps_search_node->s_mv.i2_mvy = (S16)((i4_mv_y - 1) << 1);
3140
                /* top pt is i4_mv_x, i4_mv_y - 1 */
3141
                i4_frac_x = i4_mv_x & 1;
3142
                i4_frac_y = (i4_mv_y - 1) & 1;
3143
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3144
                s_err_prms.pu1_ref =
3145
                    pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y - 1) >> 1) * i4_ref_stride);
3146
                pf_err_compute(&s_err_prms);
3147
                /* Update the mv's with the current candt motion vectors */
3148
                s_result_prms.i2_mv_x = mvx_qpel;
3149
                s_result_prms.i2_mv_y = mvy_qpel - 2;
3150
                hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3151
3152
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3153
3154
                if(i4_tot_cost < i4_min_cost)
3155
                {
3156
                    i4_min_cost = i4_tot_cost;
3157
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3158
                    e_min_id = PT_T;
3159
                    pu1_final_out = s_err_prms.pu1_ref;
3160
                }
3161
            }
3162
        }
3163
        /* right pt */
3164
        if(i4_grid_mask & BIT_EN(PT_R))
3165
        {
3166
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3167
                ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel, check_for_duplicate);
3168
3169
            if(!check_for_duplicate)
3170
            {
3171
                /* search node mv is stored in qpel units */
3172
                ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x + 1) << 1);
3173
                ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
3174
                /* right pt is i4_mv_x + 1, i4_mv_y */
3175
                i4_frac_x = (i4_mv_x + 1) & 1;
3176
                i4_frac_y = i4_mv_y & 1;
3177
3178
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3179
                s_err_prms.pu1_ref =
3180
                    pu1_ref + ((i4_mv_x + 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
3181
                pf_err_compute(&s_err_prms);
3182
                /* Update the mv's with the current candt motion vectors */
3183
                s_result_prms.i2_mv_x = mvx_qpel + 2;
3184
                s_result_prms.i2_mv_y = mvy_qpel;
3185
                hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3186
3187
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3188
3189
                if(i4_tot_cost < i4_min_cost)
3190
                {
3191
                    i4_min_cost = i4_tot_cost;
3192
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3193
                    e_min_id = PT_R;
3194
                    pu1_final_out = s_err_prms.pu1_ref;
3195
                }
3196
            }
3197
        }
3198
        /* bottom pt */
3199
        if(i4_grid_mask & BIT_EN(PT_B))
3200
        {
3201
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3202
                ps_dedup_enabler, 1, mvx_qpel, mvy_qpel + 2, check_for_duplicate);
3203
3204
            if(!check_for_duplicate)
3205
            {
3206
                /* search node mv is stored in qpel units */
3207
                ps_search_node->s_mv.i2_mvx = ((S16)i4_mv_x << 1);
3208
                ps_search_node->s_mv.i2_mvy = ((S16)(i4_mv_y + 1) << 1);
3209
                i4_frac_x = i4_mv_x & 1;
3210
                i4_frac_y = (i4_mv_y + 1) & 1;
3211
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3212
                s_err_prms.pu1_ref =
3213
                    pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y + 1) >> 1) * i4_ref_stride);
3214
3215
                pf_err_compute(&s_err_prms);
3216
                /* Update the mv's with the current candt motion vectors */
3217
                s_result_prms.i2_mv_x = mvx_qpel;
3218
                s_result_prms.i2_mv_y = mvy_qpel + 2;
3219
                hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3220
3221
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3222
3223
                if(i4_tot_cost < i4_min_cost)
3224
                {
3225
                    i4_min_cost = i4_tot_cost;
3226
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3227
                    e_min_id = PT_B;
3228
                    pu1_final_out = s_err_prms.pu1_ref;
3229
                }
3230
            }
3231
        }
3232
        if(e_min_id == PT_C)
3233
        {
3234
            if(!i4_i)
3235
            {
3236
                /* TL pt */
3237
                if(i4_grid_mask & BIT_EN(PT_TL))
3238
                {
3239
                    S32 mvx_minus_1 = (i4_mv_x - 1);
3240
                    S32 mvy_minus_1 = (i4_mv_y - 1);
3241
3242
                    CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3243
                        ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel - 2, check_for_duplicate);
3244
3245
                    if(!check_for_duplicate)
3246
                    {
3247
                        /* search node mv is stored in qpel units */
3248
                        ps_search_node->s_mv.i2_mvx = ((S16)mvx_minus_1 << 1);
3249
                        ps_search_node->s_mv.i2_mvy = ((S16)mvy_minus_1 << 1);
3250
                        i4_frac_x = mvx_minus_1 & 1;
3251
                        i4_frac_y = mvy_minus_1 & 1;
3252
                        pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3253
                        s_err_prms.pu1_ref =
3254
                            pu1_ref + (mvx_minus_1 >> 1) + ((mvy_minus_1 >> 1) * i4_ref_stride);
3255
3256
                        pf_err_compute(&s_err_prms);
3257
                        /* Update the mv's with the current candt motion vectors */
3258
                        s_result_prms.i2_mv_x = mvx_qpel - 2;
3259
                        s_result_prms.i2_mv_y = mvy_qpel - 2;
3260
                        hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3261
3262
                        i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3263
3264
                        if(i4_tot_cost < i4_min_cost)
3265
                        {
3266
                            i4_min_cost = i4_tot_cost;
3267
                            i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3268
                            e_min_id = PT_TL;
3269
                            pu1_final_out = s_err_prms.pu1_ref;
3270
                        }
3271
                    }
3272
                }
3273
                /* TR pt */
3274
                if(i4_grid_mask & BIT_EN(PT_TR))
3275
                {
3276
                    S32 mvx_plus_1 = (i4_mv_x + 1);
3277
                    S32 mvy_minus_1 = (i4_mv_y - 1);
3278
3279
                    CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3280
                        ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel - 2, check_for_duplicate);
3281
3282
                    if(!check_for_duplicate)
3283
                    {
3284
                        /* search node mv is stored in qpel units */
3285
                        ps_search_node->s_mv.i2_mvx = ((S16)mvx_plus_1 << 1);
3286
                        ps_search_node->s_mv.i2_mvy = ((S16)mvy_minus_1 << 1);
3287
                        i4_frac_x = mvx_plus_1 & 1;
3288
                        i4_frac_y = mvy_minus_1 & 1;
3289
                        pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3290
                        s_err_prms.pu1_ref =
3291
                            pu1_ref + (mvx_plus_1 >> 1) + ((mvy_minus_1 >> 1) * i4_ref_stride);
3292
3293
                        pf_err_compute(&s_err_prms);
3294
                        /* Update the mv's with the current candt motion vectors */
3295
                        s_result_prms.i2_mv_x = mvx_qpel + 2;
3296
                        s_result_prms.i2_mv_y = mvy_qpel - 2;
3297
                        hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3298
3299
                        i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3300
3301
                        if(i4_tot_cost < i4_min_cost)
3302
                        {
3303
                            i4_min_cost = i4_tot_cost;
3304
                            i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3305
                            e_min_id = PT_TR;
3306
                            pu1_final_out = s_err_prms.pu1_ref;
3307
                        }
3308
                    }
3309
                }
3310
                /* BL pt */
3311
                if(i4_grid_mask & BIT_EN(PT_BL))
3312
                {
3313
                    S32 mvx_minus_1 = (i4_mv_x - 1);
3314
                    S32 mvy_plus_1 = (i4_mv_y + 1);
3315
3316
                    CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3317
                        ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel + 2, check_for_duplicate);
3318
3319
                    if(!check_for_duplicate)
3320
                    {
3321
                        /* search node mv is stored in qpel units */
3322
                        ps_search_node->s_mv.i2_mvx = ((S16)mvx_minus_1 << 1);
3323
                        ps_search_node->s_mv.i2_mvy = ((S16)mvy_plus_1 << 1);
3324
                        i4_frac_x = mvx_minus_1 & 1;
3325
                        i4_frac_y = mvy_plus_1 & 1;
3326
                        pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3327
                        s_err_prms.pu1_ref =
3328
                            pu1_ref + (mvx_minus_1 >> 1) + ((mvy_plus_1 >> 1) * i4_ref_stride);
3329
3330
                        pf_err_compute(&s_err_prms);
3331
                        /* Update the mv's with the current candt motion vectors */
3332
                        s_result_prms.i2_mv_x = mvx_qpel - 2;
3333
                        s_result_prms.i2_mv_y = mvy_qpel + 2;
3334
                        hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3335
3336
                        i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3337
3338
                        if(i4_tot_cost < i4_min_cost)
3339
                        {
3340
                            i4_min_cost = i4_tot_cost;
3341
                            i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3342
                            e_min_id = PT_BL;
3343
                            pu1_final_out = s_err_prms.pu1_ref;
3344
                        }
3345
                    }
3346
                }
3347
                /* BR pt */
3348
                if(i4_grid_mask & BIT_EN(PT_BR))
3349
                {
3350
                    S32 mvx_plus_1 = (i4_mv_x + 1);
3351
                    S32 mvy_plus_1 = (i4_mv_y + 1);
3352
                    CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3353
                        ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel + 2, check_for_duplicate);
3354
3355
                    if(!check_for_duplicate)
3356
                    {
3357
                        /* search node mv is stored in qpel units */
3358
                        ps_search_node->s_mv.i2_mvx = ((S16)mvx_plus_1 << 1);
3359
                        ps_search_node->s_mv.i2_mvy = ((S16)mvy_plus_1 << 1);
3360
                        i4_frac_x = mvx_plus_1 & 1;
3361
                        i4_frac_y = mvy_plus_1 & 1;
3362
                        pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3363
                        s_err_prms.pu1_ref =
3364
                            pu1_ref + (mvx_plus_1 >> 1) + ((mvy_plus_1 >> 1) * i4_ref_stride);
3365
3366
                        pf_err_compute(&s_err_prms);
3367
                        /* Update the mv's with the current candt motion vectors */
3368
                        s_result_prms.i2_mv_x = mvx_qpel + 2;
3369
                        s_result_prms.i2_mv_y = mvy_qpel + 2;
3370
                        hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3371
3372
                        i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3373
3374
                        if(i4_tot_cost < i4_min_cost)
3375
                        {
3376
                            i4_min_cost = i4_tot_cost;
3377
                            i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3378
                            e_min_id = PT_BR;
3379
                            pu1_final_out = s_err_prms.pu1_ref;
3380
                        }
3381
                    }
3382
                }
3383
                if(e_min_id == PT_C)
3384
                {
3385
                    break;
3386
                }
3387
            }
3388
            else
3389
            {
3390
                break;
3391
            }
3392
        }
3393
3394
        /*********************************************************************/
3395
        /* Depending on the best result location, we may be able to skip     */
3396
        /* at least two pts, centre pt and one more pt. E.g. if right pt is  */
3397
        /* the best result, the next iteration need not do centre, left pts  */
3398
        /*********************************************************************/
3399
        if(i4_i)
3400
        {
3401
            i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
3402
        }
3403
        else
3404
        {
3405
            i4_grid_mask = gai4_opt_grid_mask_conventional[e_min_id];
3406
        }
3407
        i4_mv_x += gai1_grid_id_to_x[e_min_id];
3408
        i4_mv_y += gai1_grid_id_to_y[e_min_id];
3409
        ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1);
3410
        ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
3411
        i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
3412
    }
3413
3414
    /* Convert to QPEL units */
3415
    i4_mv_x <<= 1;
3416
    i4_mv_y <<= 1;
3417
3418
    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3419
    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3420
3421
    /* Early exit if this partition is visiting same hpel mv again */
3422
    /* Assumption : Checking for early exit in best result of partition */
3423
    if((ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_x ==
3424
        ps_search_node->s_mv.i2_mvx) &&
3425
       (ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_y ==
3426
        ps_search_node->s_mv.i2_mvy))
3427
    {
3428
        return (ps_search_results->aps_part_results[search_idx][part_id][0].i4_tot_cost);
3429
    }
3430
    else
3431
    {
3432
        /* Store the best hpel mv for future early exit checks */
3433
        ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_x =
3434
            (S16)i4_mv_x;
3435
        ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_y =
3436
            (S16)i4_mv_y;
3437
    }
3438
3439
    /* Early exit if this partition is visiting same hpel mv again */
3440
    /* Assumption : Checking for early exit in second best result of partition */
3441
    if((ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_x ==
3442
        ps_search_node->s_mv.i2_mvx) &&
3443
       (ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_y ==
3444
        ps_search_node->s_mv.i2_mvy))
3445
    {
3446
        return (ps_search_results->aps_part_results[search_idx][part_id][1].i4_tot_cost);
3447
    }
3448
    else
3449
    {
3450
        /* Store the best hpel mv for future early exit checks */
3451
        ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_x =
3452
            (S16)i4_mv_x;
3453
        ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_y =
3454
            (S16)i4_mv_y;
3455
    }
3456
3457
    /* Exact interpolation or averaging chosen here */
3458
    pf_qpel_interp = ps_prms->pf_qpel_interp;
3459
3460
    /* Next QPEL ME */
3461
    /* In this case, we have option of doing exact QPEL interpolation or avg */
3462
    /*************************************************************************/
3463
    /*        x                                                              */
3464
    /*    A b C d                                                            */
3465
    /*    e f g h                                                            */
3466
    /*    I j K l                                                            */
3467
    /*    m n o p                                                            */
3468
    /*    Q r S t                                                            */
3469
    /*                                                                       */
3470
    /*    Approximate QPEL logic                                             */
3471
    /*    b = avg(A,C) f = avg(I,C), g= avg(C,K) j=avg(I,K)                  */
3472
    /*    for any given pt, we can get all the information required about    */
3473
    /*    the surrounding 4 pts. For example, given point C (0.5, 0)         */
3474
    /*     surrounding pts info:                                             */
3475
    /*     b : qpel offset: 1, 0, generated by averaging. buffer1: fpel buf  */
3476
    /*           buffer 2: hxfy, offsets for both are 0, 0                   */
3477
    /*    similarly for other pts the info can be gotten                     */
3478
    /*************************************************************************/
3479
    i4_grid_mask = GRID_ALL_PTS_VALID ^ (BIT_EN(PT_C));
3480
    i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
3481
3482
    /*************************************************************************/
3483
    /* One time preparation of non changing interpolation params. These      */
3484
    /* include a set of ping pong result buf ptrs, input buf ptrs and some   */
3485
    /* working memory (not used though in case of averaging).                */
3486
    /*************************************************************************/
3487
    s_interp_prms.ppu1_ref = &apu1_hpel_ref[0];
3488
    s_interp_prms.i4_ref_stride = i4_ref_stride;
3489
    s_interp_prms.i4_blk_wd = i4_blk_wd;
3490
    s_interp_prms.i4_blk_ht = i4_blk_ht;
3491
3492
    i4_final_out_stride = i4_ref_stride;
3493
3494
    {
3495
        U08 *pu1_mem;
3496
        /*********************************************************************/
3497
        /* Allocation of working memory for interpolated buffers. We maintain*/
3498
        /* an intermediate working buffer, and 2 ping pong interpolated out  */
3499
        /* buffers, purpose of ping pong explained later below               */
3500
        /*********************************************************************/
3501
        pu1_mem = ps_prms->pu1_wkg_mem;
3502
        s_interp_prms.pu1_wkg_mem = pu1_mem;
3503
3504
        //pu1_mem += (INTERP_INTERMED_BUF_SIZE);
3505
        s_interp_prms.apu1_interp_out[0] = pu1_mem;
3506
3507
        pu1_mem += (INTERP_OUT_BUF_SIZE);
3508
        s_interp_prms.apu1_interp_out[1] = pu1_mem;
3509
3510
        pu1_mem += (INTERP_OUT_BUF_SIZE);
3511
        s_interp_prms.apu1_interp_out[2] = pu1_mem;
3512
3513
        pu1_mem += (INTERP_OUT_BUF_SIZE);
3514
        s_interp_prms.apu1_interp_out[3] = pu1_mem;
3515
3516
        pu1_mem += (INTERP_OUT_BUF_SIZE);
3517
        s_interp_prms.apu1_interp_out[4] = pu1_mem;
3518
3519
        /*********************************************************************/
3520
        /* Stride of interpolated output is just a function of blk width of  */
3521
        /* this partition and hence remains constant for this partition      */
3522
        /*********************************************************************/
3523
        s_interp_prms.i4_out_stride = (i4_blk_wd);
3524
    }
3525
3526
    {
3527
        UWORD8 *apu1_final[4];
3528
        WORD32 ai4_ref_stride[4];
3529
        /*************************************************************************/
3530
        /* Ping pong design for interpolated buffers. We use a min id, which     */
3531
        /* tracks the id of the ppu1_interp_out that stores the best result.     */
3532
        /* When new interp to be done, it uses 1 - best result id to do the interp */
3533
        /* min id is toggled when any new result becomes the best result.        */
3534
        /*************************************************************************/
3535
3536
        for(i4_i = 0; i4_i < i4_num_qpel_refine; i4_i++)
3537
        {
3538
            e_min_id = PT_C;
3539
3540
            hme_qpel_interp_comprehensive(
3541
                &s_interp_prms, apu1_final, ai4_ref_stride, i4_mv_x, i4_mv_y, i4_grid_mask);
3542
3543
            mvx_qpel = i4_mv_x;
3544
            mvy_qpel = i4_mv_y;
3545
3546
            if(i4_grid_mask & BIT_EN(PT_L))
3547
            {
3548
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3549
                    ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel - 0, check_for_duplicate);
3550
3551
                if(!check_for_duplicate)
3552
                {
3553
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
3554
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3555
3556
                    s_err_prms.pu1_ref = apu1_final[0];
3557
                    s_err_prms.i4_ref_stride = ai4_ref_stride[0];
3558
3559
                    pf_err_compute(&s_err_prms);
3560
                    /* Update the mv's with the current candt motion vectors */
3561
                    s_result_prms.i2_mv_x = mvx_qpel - 1;
3562
                    s_result_prms.i2_mv_y = mvy_qpel;
3563
                    hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3564
3565
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3566
                    if(i4_tot_cost < i4_min_cost)
3567
                    {
3568
                        e_min_id = PT_L;
3569
                        i4_min_cost = i4_tot_cost;
3570
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3571
                    }
3572
                }
3573
            }
3574
            if(i4_grid_mask & BIT_EN(PT_T))
3575
            {
3576
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3577
                    ps_dedup_enabler, 1, mvx_qpel - 0, mvy_qpel - 1, check_for_duplicate);
3578
3579
                if(!check_for_duplicate)
3580
                {
3581
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3582
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
3583
3584
                    s_err_prms.pu1_ref = apu1_final[1];
3585
                    s_err_prms.i4_ref_stride = ai4_ref_stride[1];
3586
3587
                    pf_err_compute(&s_err_prms);
3588
                    /* Update the mv's with the current candt motion vectors */
3589
                    s_result_prms.i2_mv_x = mvx_qpel;
3590
                    s_result_prms.i2_mv_y = mvy_qpel - 1;
3591
                    hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3592
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3593
                    if(i4_tot_cost < i4_min_cost)
3594
                    {
3595
                        e_min_id = PT_T;
3596
                        i4_min_cost = i4_tot_cost;
3597
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3598
                    }
3599
                }
3600
            }
3601
            if(i4_grid_mask & BIT_EN(PT_R))
3602
            {
3603
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3604
                    ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel, check_for_duplicate);
3605
3606
                if(!check_for_duplicate)
3607
                {
3608
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
3609
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3610
3611
                    s_err_prms.pu1_ref = apu1_final[2];
3612
                    s_err_prms.i4_ref_stride = ai4_ref_stride[2];
3613
3614
                    pf_err_compute(&s_err_prms);
3615
                    /* Update the mv's with the current candt motion vectors */
3616
                    s_result_prms.i2_mv_x = mvx_qpel + 1;
3617
                    s_result_prms.i2_mv_y = mvy_qpel;
3618
                    hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3619
3620
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3621
                    if(i4_tot_cost < i4_min_cost)
3622
                    {
3623
                        e_min_id = PT_R;
3624
                        i4_min_cost = i4_tot_cost;
3625
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3626
                    }
3627
                }
3628
            }
3629
            /* i4_mv_x and i4_mv_y will always be the centre pt */
3630
            /* for qpel we  start with least hpel, and hence compute of center pt never reqd */
3631
            if(i4_grid_mask & BIT_EN(PT_B))
3632
            {
3633
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3634
                    ps_dedup_enabler, 1, mvx_qpel, mvy_qpel + 1, check_for_duplicate);
3635
3636
                if(!check_for_duplicate)
3637
                {
3638
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3639
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
3640
3641
                    s_err_prms.pu1_ref = apu1_final[3];
3642
                    s_err_prms.i4_ref_stride = ai4_ref_stride[3];
3643
3644
                    pf_err_compute(&s_err_prms);
3645
                    /* Update the mv's with the current candt motion vectors */
3646
                    s_result_prms.i2_mv_x = mvx_qpel;
3647
                    s_result_prms.i2_mv_y = mvy_qpel + 1;
3648
                    hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3649
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3650
                    if(i4_tot_cost < i4_min_cost)
3651
                    {
3652
                        e_min_id = PT_B;
3653
                        i4_min_cost = i4_tot_cost;
3654
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3655
                    }
3656
                }
3657
            }
3658
3659
            if(e_min_id == PT_C)
3660
            {
3661
                if(!i4_i)
3662
                {
3663
                    S32 i4_interp_buf_id = 0;
3664
3665
                    if(i4_grid_mask & BIT_EN(PT_TL))
3666
                    {
3667
                        CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3668
                            ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel - 1, check_for_duplicate);
3669
3670
                        if(!check_for_duplicate)
3671
                        {
3672
                            ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
3673
                            ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
3674
3675
                            /* Carry out the interpolation */
3676
                            pf_qpel_interp(
3677
                                &s_interp_prms, i4_mv_x - 1, i4_mv_y - 1, i4_interp_buf_id);
3678
3679
                            s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3680
                            s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3681
3682
                            pf_err_compute(&s_err_prms);
3683
                            /* Update the mv's with the current candt motion vectors */
3684
                            s_result_prms.i2_mv_x = mvx_qpel - 1;
3685
                            s_result_prms.i2_mv_y = mvy_qpel - 1;
3686
                            hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3687
3688
                            i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3689
3690
                            if(i4_tot_cost < i4_min_cost)
3691
                            {
3692
                                e_min_id = PT_TL;
3693
                                i4_min_cost = i4_tot_cost;
3694
                                i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3695
                            }
3696
                        }
3697
                    }
3698
                    if(i4_grid_mask & BIT_EN(PT_TR))
3699
                    {
3700
                        CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3701
                            ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel - 1, check_for_duplicate);
3702
3703
                        if(!check_for_duplicate)
3704
                        {
3705
                            ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
3706
                            ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
3707
3708
                            /* Carry out the interpolation */
3709
                            pf_qpel_interp(
3710
                                &s_interp_prms, i4_mv_x + 1, i4_mv_y - 1, i4_interp_buf_id);
3711
3712
                            s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3713
                            s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3714
3715
                            pf_err_compute(&s_err_prms);
3716
                            /* Update the mv's with the current candt motion vectors */
3717
                            s_result_prms.i2_mv_x = mvx_qpel + 1;
3718
                            s_result_prms.i2_mv_y = mvy_qpel - 1;
3719
                            hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3720
3721
                            i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3722
3723
                            if(i4_tot_cost < i4_min_cost)
3724
                            {
3725
                                e_min_id = PT_TR;
3726
                                i4_min_cost = i4_tot_cost;
3727
                                i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3728
                            }
3729
                        }
3730
                    }
3731
                    if(i4_grid_mask & BIT_EN(PT_BL))
3732
                    {
3733
                        CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3734
                            ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel + 1, check_for_duplicate);
3735
3736
                        if(!check_for_duplicate)
3737
                        {
3738
                            ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
3739
                            ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
3740
3741
                            /* Carry out the interpolation */
3742
                            pf_qpel_interp(
3743
                                &s_interp_prms, i4_mv_x - 1, i4_mv_y + 1, i4_interp_buf_id);
3744
3745
                            s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3746
                            s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3747
3748
                            pf_err_compute(&s_err_prms);
3749
                            /* Update the mv's with the current candt motion vectors */
3750
                            s_result_prms.i2_mv_x = mvx_qpel - 1;
3751
                            s_result_prms.i2_mv_y = mvy_qpel + 1;
3752
                            hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3753
3754
                            i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3755
3756
                            if(i4_tot_cost < i4_min_cost)
3757
                            {
3758
                                e_min_id = PT_BL;
3759
                                i4_min_cost = i4_tot_cost;
3760
                                i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3761
                            }
3762
                        }
3763
                    }
3764
                    /* i4_mv_x and i4_mv_y will always be the centre pt */
3765
                    /* for qpel we  start with least hpel, and hence compute of center pt never reqd */
3766
                    if(i4_grid_mask & BIT_EN(PT_BR))
3767
                    {
3768
                        CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3769
                            ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel + 1, check_for_duplicate);
3770
3771
                        if(!check_for_duplicate)
3772
                        {
3773
                            ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
3774
                            ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
3775
3776
                            /* Carry out the interpolation */
3777
                            pf_qpel_interp(
3778
                                &s_interp_prms, i4_mv_x + 1, i4_mv_y + 1, i4_interp_buf_id);
3779
3780
                            s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3781
                            s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3782
3783
                            pf_err_compute(&s_err_prms);
3784
                            /* Update the mv's with the current candt motion vectors */
3785
                            s_result_prms.i2_mv_x = mvx_qpel + 1;
3786
                            s_result_prms.i2_mv_y = mvy_qpel + 1;
3787
                            hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3788
3789
                            i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3790
3791
                            if(i4_tot_cost < i4_min_cost)
3792
                            {
3793
                                e_min_id = PT_BR;
3794
                                i4_min_cost = i4_tot_cost;
3795
                                i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3796
                            }
3797
                        }
3798
                    }
3799
                    if(e_min_id == PT_C)
3800
                    {
3801
                        break;
3802
                    }
3803
                }
3804
                else
3805
                {
3806
                    break;
3807
                }
3808
            }
3809
3810
            if(i4_i)
3811
            {
3812
                i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
3813
            }
3814
            else
3815
            {
3816
                i4_grid_mask = gai4_opt_grid_mask_conventional[e_min_id];
3817
            }
3818
            i4_mv_x += gai1_grid_id_to_x[e_min_id];
3819
            i4_mv_y += gai1_grid_id_to_y[e_min_id];
3820
            ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3821
            ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3822
            i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
3823
        }
3824
    }
3825
3826
    /* update modified motion vectors and cost at end of subpel */
3827
    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3828
    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3829
    ps_search_node->i4_tot_cost = i4_min_cost;
3830
    ps_search_node->i4_sad = i4_min_sad;
3831
3832
    /********************************************************************************/
3833
    /* TODO: Restoring back Sad lambda from Hadamard lambda                         */
3834
    /* Need to pass the had/satd lambda in more cleaner way for subpel cost compute */
3835
    /********************************************************************************/
3836
    //ps_pred_ctxt->lambda >>= 1;
3837
3838
    return (i4_min_cost);
3839
}
3840
#endif
3841
3842
static void hme_subpel_refine_struct_to_search_results_struct_converter(
3843
    subpel_refine_ctxt_t *ps_subpel_refine_ctxt,
3844
    search_results_t *ps_search_results,
3845
    U08 u1_pred_dir,
3846
    ME_QUALITY_PRESETS_T e_quality_preset)
3847
3.43M
{
3848
3.43M
    U08 i;
3849
3850
3.43M
    U08 u1_num_results_per_part = ps_search_results->u1_num_results_per_part;
3851
3852
29.1M
    for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
3853
25.6M
    {
3854
25.6M
        S32 index;
3855
25.6M
        S32 i4_sad;
3856
3857
25.6M
        S32 part_id = ps_subpel_refine_ctxt->ai4_part_id[i];
3858
3859
25.6M
        search_node_t *ps_best_node = ps_search_results->aps_part_results[u1_pred_dir][part_id];
3860
3861
25.6M
        if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
3862
23.3M
        {
3863
23.3M
            index = part_id;
3864
23.3M
        }
3865
2.33M
        else
3866
2.33M
        {
3867
2.33M
            index = i;
3868
2.33M
        }
3869
3870
25.6M
        if(!ps_best_node->u1_subpel_done)
3871
14.5M
        {
3872
14.5M
            i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3873
14.5M
                     ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3874
14.5M
            ps_best_node[0].i4_sdi = 0;
3875
14.5M
            ASSERT((e_quality_preset == ME_PRISTINE_QUALITY) ? (ps_best_node[0].i4_sdi >= 0) : 1);
3876
14.5M
            ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3877
3878
14.5M
            if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3879
2.12k
            {
3880
2.12k
                i4_sad = MAX_SIGNED_16BIT_VAL;
3881
2.12k
            }
3882
3883
14.5M
            ps_best_node[0].i4_sad = i4_sad;
3884
14.5M
            ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3885
14.5M
            ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3886
14.5M
            ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3887
14.5M
            ps_best_node[0].i1_ref_idx = (WORD8)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3888
14.5M
            ps_best_node->u1_subpel_done = 1;
3889
3890
14.5M
            if(2 == u1_num_results_per_part)
3891
0
            {
3892
0
                i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[1][index] -
3893
0
                         ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3894
0
                ps_best_node[1].i4_sdi = 0;
3895
0
                ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[1][index];
3896
3897
0
                if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] == MAX_SIGNED_16BIT_VAL)
3898
0
                {
3899
0
                    i4_sad = MAX_SIGNED_16BIT_VAL;
3900
0
                }
3901
3902
0
                ps_best_node[1].i4_sad = i4_sad;
3903
0
                ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3904
0
                ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[1][index];
3905
0
                ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[1][index];
3906
0
                ps_best_node[1].i1_ref_idx = (WORD8)ps_subpel_refine_ctxt->i2_ref_idx[1][index];
3907
0
                ps_best_node[1].u1_subpel_done = 1;
3908
0
            }
3909
14.5M
        }
3910
11.1M
        else if(
3911
11.1M
            (2 == u1_num_results_per_part) &&
3912
11.1M
            (ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[1].i4_tot_cost))
3913
0
        {
3914
0
            if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] < ps_best_node[0].i4_tot_cost)
3915
0
            {
3916
0
                i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3917
0
                         ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3918
0
                ps_best_node[0].i4_sdi = 0;
3919
0
                ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3920
3921
0
                if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3922
0
                {
3923
0
                    i4_sad = MAX_SIGNED_16BIT_VAL;
3924
0
                }
3925
3926
0
                ps_best_node[0].i4_sad = i4_sad;
3927
0
                ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3928
0
                ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3929
0
                ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3930
0
                ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3931
3932
0
                i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[1][index] -
3933
0
                         ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3934
0
                ps_best_node[1].i4_sdi = 0;
3935
0
                ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[1][index];
3936
3937
0
                if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] == MAX_SIGNED_16BIT_VAL)
3938
0
                {
3939
0
                    i4_sad = MAX_SIGNED_16BIT_VAL;
3940
0
                }
3941
3942
0
                ps_best_node[1].i4_sad = i4_sad;
3943
0
                ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3944
0
                ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[1][index];
3945
0
                ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[1][index];
3946
0
                ps_best_node[1].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[1][index];
3947
0
            }
3948
0
            else if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] > ps_best_node[0].i4_tot_cost)
3949
0
            {
3950
0
                if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] >= ps_best_node[0].i4_tot_cost)
3951
0
                {
3952
0
                    i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3953
0
                             ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3954
0
                    ps_best_node[1].i4_sdi = 0;
3955
0
                    ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3956
3957
0
                    if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3958
0
                    {
3959
0
                        i4_sad = MAX_SIGNED_16BIT_VAL;
3960
0
                    }
3961
3962
0
                    ps_best_node[1].i4_sad = i4_sad;
3963
0
                    ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3964
0
                    ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3965
0
                    ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3966
0
                    ps_best_node[1].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3967
0
                }
3968
0
                else if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[0].i4_tot_cost)
3969
0
                {
3970
0
                    memmove(&ps_best_node[1], &ps_best_node[0], sizeof(search_node_t));
3971
3972
0
                    i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3973
0
                             ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3974
0
                    ps_best_node[0].i4_sdi = 0;
3975
0
                    ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3976
3977
0
                    if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3978
0
                    {
3979
0
                        i4_sad = MAX_SIGNED_16BIT_VAL;
3980
0
                    }
3981
3982
0
                    ps_best_node[0].i4_sad = i4_sad;
3983
0
                    ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3984
0
                    ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3985
0
                    ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3986
0
                    ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3987
0
                }
3988
0
            }
3989
0
        }
3990
11.1M
        else if(
3991
11.1M
            (1 == u1_num_results_per_part) &&
3992
11.1M
            (ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[0].i4_tot_cost))
3993
382k
        {
3994
382k
            i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3995
382k
                     ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3996
382k
            ps_best_node[0].i4_sdi = 0;
3997
382k
            ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3998
3999
382k
            if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
4000
0
            {
4001
0
                i4_sad = MAX_SIGNED_16BIT_VAL;
4002
0
            }
4003
4004
382k
            ps_best_node[0].i4_sad = i4_sad;
4005
382k
            ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
4006
382k
            ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
4007
382k
            ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
4008
382k
            ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
4009
382k
        }
4010
25.6M
    }
4011
3.43M
}
4012
4013
/**
4014
********************************************************************************
4015
*  @fn     void hme_subpel_refine_cu_hs
4016
*
4017
*  @brief  Evaluates the best subpel mvs for active partitions of an MB in L0
4018
*          layer for the high speed preset. Recursive hadamard SATD / SAD
4019
*          and mv cost is used for 2NxN and NxN partitions with active partition
4020
*          update
4021
*
4022
*  @param[in]  ps_prms: subpel prms input to this function
4023
*
4024
*  @param[in]  ps_curr_layer: points to the current layer ctxt
4025
*
4026
*  @param[out] ps_search_results: points to the search results that get updated
4027
*              with best results
4028
*
4029
*  @param[in]  search_idx:  ref id of the frame for which results get updated
4030
*
4031
*  @param[in]  ps_wt_inp_prms:  current frame input params
4032
*
4033
*  @return     None
4034
********************************************************************************
4035
*/
4036
void hme_subpel_refine_cu_hs(
4037
    hme_subpel_prms_t *ps_prms,
4038
    layer_ctxt_t *ps_curr_layer,
4039
    search_results_t *ps_search_results,
4040
    S32 search_idx,
4041
    wgt_pred_ctxt_t *ps_wt_inp_prms,
4042
    WORD32 blk_8x8_mask,
4043
    me_func_selector_t *ps_func_selector,
4044
    ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list,
4045
    ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
4046
3.43M
{
4047
    /* Unique search node list for 2nx2n and nxn partitions */
4048
3.43M
    search_node_t as_nodes_2nx2n[MAX_RESULTS_PER_PART * 5];
4049
3.43M
    subpel_dedup_enabler_t as_subpel_dedup_enabler[MAX_NUM_REF];
4050
3.43M
    search_node_t *ps_search_node;
4051
4052
3.43M
    S32 i, i4_part_mask, j;
4053
3.43M
    S32 i4_sad_grid;
4054
3.43M
    S32 max_subpel_cand;
4055
3.43M
    WORD32 index;
4056
3.43M
    S32 num_unique_nodes_2nx2n;
4057
3.43M
    S32 part_id;
4058
3.43M
    S32 x_off, y_off;
4059
3.43M
    S32 i4_inp_off;
4060
4061
3.43M
    CU_SIZE_T e_cu_size;
4062
3.43M
    BLK_SIZE_T e_blk_size;
4063
4064
3.43M
    subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_prms->ps_subpel_refine_ctxt;
4065
4066
3.43M
    S32 i4_use_satd = ps_prms->i4_use_satd;
4067
3.43M
    S32 i4_num_act_refs = ps_prms->i4_num_act_ref_l0 + ps_prms->i4_num_act_ref_l1;
4068
4069
3.43M
    ASSERT(ps_search_results->u1_num_results_per_part <= MAX_RESULTS_PER_PART);
4070
4071
3.43M
    if(!DISABLE_SUBPEL_REFINEMENT_WHEN_SRC_IS_NOISY || !ps_prms->u1_is_cu_noisy)
4072
3.43M
    {
4073
3.43M
        e_cu_size = ps_search_results->e_cu_size;
4074
3.43M
        i4_part_mask = ps_search_results->i4_part_mask;
4075
4076
3.43M
        ps_prms->i4_inp_type = sizeof(U08);
4077
4078
3.43M
        num_unique_nodes_2nx2n = 0;
4079
4080
11.5M
        for(i = 0; i < i4_num_act_refs; i++)
4081
8.12M
        {
4082
8.12M
            as_subpel_dedup_enabler[i].u1_ref_idx = MAX_NUM_REF;
4083
8.12M
        }
4084
4085
        /************************************************************************/
4086
        /*                                                                      */
4087
        /*  Initialize SATD cost for each valid partition id.one time before    */
4088
        /*  doing full pel time. This is because of the following reasons:      */
4089
        /*   1. Full pel cost was done in  SAD while subpel is in SATD mode     */
4090
        /*   2. Partitions like AMP, Nx2N and 2NxN are refined on the fly while */
4091
        /*      doing Diamond search for 2Nx2N and NxN. This partitions are     */
4092
        /*      not explicitly refine in high speed mode                        */
4093
        /*                                                                      */
4094
        /************************************************************************/
4095
29.1M
        for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
4096
25.6M
        {
4097
25.6M
            S32 enable_subpel = 0;
4098
25.6M
            S32 part_type;
4099
4100
            /* Derive the x and y offsets of this part id */
4101
25.6M
            part_id = ps_subpel_refine_ctxt->ai4_part_id[i];
4102
25.6M
            if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
4103
23.3M
            {
4104
23.3M
                index = part_id;
4105
23.3M
            }
4106
2.33M
            else
4107
2.33M
            {
4108
2.33M
                index = i;
4109
2.33M
            }
4110
4111
25.6M
            part_type = ge_part_id_to_part_type[part_id];
4112
25.6M
            x_off = gas_part_attr_in_cu[part_id].u1_x_start << e_cu_size;
4113
25.6M
            y_off = gas_part_attr_in_cu[part_id].u1_y_start << e_cu_size;
4114
25.6M
            x_off += ps_search_results->u1_x_off;
4115
25.6M
            y_off += ps_search_results->u1_y_off;
4116
25.6M
            i4_inp_off = x_off + y_off * ps_prms->i4_inp_stride;
4117
25.6M
            e_blk_size = ge_part_id_to_blk_size[e_cu_size][part_id];
4118
4119
25.6M
            x_off += ps_prms->i4_ctb_x_off;
4120
25.6M
            y_off += ps_prms->i4_ctb_y_off;
4121
4122
25.6M
            max_subpel_cand = 0;
4123
4124
            /* Choose the minimum number of candidates to be used for Sub pel refinement */
4125
25.6M
            if(PART_ID_2Nx2N == part_type)
4126
3.38M
            {
4127
3.38M
                max_subpel_cand =
4128
3.38M
                    MIN(ps_prms->u1_max_subpel_candts_2Nx2N,
4129
3.38M
                        ps_search_results->u1_num_results_per_part);
4130
3.38M
            }
4131
22.3M
            else if(PRT_NxN == part_type)
4132
5.77M
            {
4133
5.77M
                max_subpel_cand = MIN(
4134
5.77M
                    ps_prms->u1_max_subpel_candts_NxN, ps_search_results->u1_num_results_per_part);
4135
5.77M
            }
4136
4137
            /* If incomplete CTB, NxN num candidates should be forced to min 1 */
4138
25.6M
            if((0 == max_subpel_cand) && (blk_8x8_mask != 15))
4139
95.6k
            {
4140
95.6k
                max_subpel_cand = 1;
4141
95.6k
            }
4142
4143
25.6M
            if((PART_ID_2Nx2N == part_type) || (PRT_NxN == part_type))
4144
9.16M
            {
4145
9.16M
                enable_subpel = 1;
4146
9.16M
            }
4147
4148
            /* Compute full pel SATD for each result per partition before subpel */
4149
            /* refinement starts.                                                */
4150
            /* Also prepare unique candidate list for 2Nx2N and NxN partitions   */
4151
51.3M
            for(j = 0; j < ps_search_results->u1_num_results_per_part; j++)
4152
25.6M
            {
4153
25.6M
                err_prms_t s_err_prms;
4154
25.6M
                S32 i4_satd = 0;
4155
25.6M
                S32 i1_ref_idx;
4156
25.6M
                U08 *pu1_ref_base;
4157
25.6M
                S32 i4_ref_stride = ps_curr_layer->i4_rec_stride;
4158
25.6M
                S32 i4_mv_x, i4_mv_y;
4159
4160
25.6M
                ps_search_node = ps_search_results->aps_part_results[search_idx][part_id] + j;
4161
4162
25.6M
                if(ps_subpel_refine_ctxt->i2_mv_x[j][index] == INTRA_MV)
4163
0
                {
4164
0
                    ps_search_node->u1_subpel_done = 1;
4165
0
                    continue;
4166
0
                }
4167
4168
25.6M
                i1_ref_idx = ps_subpel_refine_ctxt->i2_ref_idx[j][index];
4169
25.6M
                ps_prms->pv_inp = (void *)(ps_wt_inp_prms->apu1_wt_inp[i1_ref_idx] + i4_inp_off);
4170
25.6M
                pu1_ref_base = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx];
4171
4172
25.6M
                i4_mv_x = ps_subpel_refine_ctxt->i2_mv_x[j][index];
4173
25.6M
                i4_mv_y = ps_subpel_refine_ctxt->i2_mv_y[j][index];
4174
4175
25.6M
                if(i4_use_satd)
4176
23.9M
                {
4177
23.9M
                    s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp;
4178
23.9M
                    s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride;
4179
23.9M
                    s_err_prms.pu1_ref = pu1_ref_base + x_off + (y_off * i4_ref_stride) + i4_mv_x +
4180
23.9M
                                         (i4_mv_y * i4_ref_stride);
4181
4182
23.9M
                    s_err_prms.i4_ref_stride = i4_ref_stride;
4183
23.9M
                    s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
4184
23.9M
                    s_err_prms.i4_grid_mask = 1;
4185
23.9M
                    s_err_prms.pi4_sad_grid = &i4_sad_grid;
4186
23.9M
                    s_err_prms.i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
4187
23.9M
                    s_err_prms.i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];
4188
4189
23.9M
                    s_err_prms.ps_cmn_utils_optimised_function_list =
4190
23.9M
                        ps_cmn_utils_optimised_function_list;
4191
4192
23.9M
                    compute_satd_8bit(&s_err_prms);
4193
4194
23.9M
                    i4_satd = s_err_prms.pi4_sad_grid[0];
4195
4196
23.9M
                    ps_subpel_refine_ctxt->i2_tot_cost[j][index] =
4197
23.9M
                        CLIP_S16(ps_subpel_refine_ctxt->i2_mv_cost[j][index] + i4_satd);
4198
23.9M
                    ps_subpel_refine_ctxt->ai2_fullpel_satd[j][index] = i4_satd;
4199
23.9M
                }
4200
4201
                /* Sub-pel candidate filtration */
4202
25.6M
                if(j)
4203
0
                {
4204
0
                    S16 i2_best_sad;
4205
0
                    S32 i4_best_mvx;
4206
0
                    S32 i4_best_mvy;
4207
4208
0
                    search_node_t *ps_node =
4209
0
                        ps_search_results->aps_part_results[search_idx][part_id];
4210
4211
0
                    U08 u1_is_subpel_done = ps_node->u1_subpel_done;
4212
0
                    S16 i2_curr_sad = ps_subpel_refine_ctxt->ai2_fullpel_satd[j][index];
4213
0
                    S32 i4_curr_mvx = i4_mv_x << 2;
4214
0
                    S32 i4_curr_mvy = i4_mv_y << 2;
4215
4216
0
                    if(u1_is_subpel_done)
4217
0
                    {
4218
0
                        i2_best_sad = ps_node->i4_sad;
4219
4220
0
                        if(ps_node->i1_ref_idx == i1_ref_idx)
4221
0
                        {
4222
0
                            i4_best_mvx = ps_node->s_mv.i2_mvx;
4223
0
                            i4_best_mvy = ps_node->s_mv.i2_mvy;
4224
0
                        }
4225
0
                        else if(i1_ref_idx == ps_subpel_refine_ctxt->i2_ref_idx[0][index])
4226
0
                        {
4227
0
                            i4_best_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
4228
0
                            i4_best_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
4229
0
                        }
4230
0
                        else
4231
0
                        {
4232
0
                            i4_best_mvx = INTRA_MV;
4233
0
                            i4_best_mvy = INTRA_MV;
4234
0
                        }
4235
0
                    }
4236
0
                    else
4237
0
                    {
4238
0
                        i2_best_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
4239
0
                                      ps_subpel_refine_ctxt->i2_mv_cost[0][index];
4240
4241
0
                        if(i1_ref_idx == ps_subpel_refine_ctxt->i2_ref_idx[0][index])
4242
0
                        {
4243
0
                            i4_best_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
4244
0
                            i4_best_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
4245
0
                        }
4246
0
                        else
4247
0
                        {
4248
0
                            i4_best_mvx = INTRA_MV;
4249
0
                            i4_best_mvy = INTRA_MV;
4250
0
                        }
4251
0
                    }
4252
4253
0
                    i2_best_sad += (i2_best_sad >> ps_prms->u1_subpel_candt_threshold);
4254
4255
0
                    if(((ABS(i4_curr_mvx - i4_best_mvx) < 2) &&
4256
0
                        (ABS(i4_curr_mvy - i4_best_mvy) < 2)) ||
4257
0
                       (i2_curr_sad > i2_best_sad))
4258
0
                    {
4259
0
                        enable_subpel = 0;
4260
0
                    }
4261
0
                }
4262
4263
25.6M
                ps_search_node->u1_part_id = part_id;
4264
4265
                /* Convert mvs in part results from FPEL to QPEL units */
4266
25.6M
                ps_subpel_refine_ctxt->i2_mv_x[j][index] <<= 2;
4267
25.6M
                ps_subpel_refine_ctxt->i2_mv_y[j][index] <<= 2;
4268
4269
                /* If the candidate number is more than the number of candts
4270
                set initally, do not add those candts for refinement */
4271
25.6M
                if(j >= max_subpel_cand)
4272
16.5M
                {
4273
16.5M
                    enable_subpel = 0;
4274
16.5M
                }
4275
4276
25.6M
                if(enable_subpel)
4277
9.12M
                {
4278
9.12M
                    if(num_unique_nodes_2nx2n == 0)
4279
3.43M
                    {
4280
3.43M
                        S32 i4_index = ps_subpel_refine_ctxt->i2_ref_idx[j][index];
4281
4282
3.43M
                        as_subpel_dedup_enabler[i4_index].i2_mv_x =
4283
3.43M
                            ps_subpel_refine_ctxt->i2_mv_x[j][index];
4284
3.43M
                        as_subpel_dedup_enabler[i4_index].i2_mv_y =
4285
3.43M
                            ps_subpel_refine_ctxt->i2_mv_y[j][index];
4286
3.43M
                        as_subpel_dedup_enabler[i4_index].u1_ref_idx =
4287
3.43M
                            (U08)ps_subpel_refine_ctxt->i2_ref_idx[j][index];
4288
3.43M
                        memset(
4289
3.43M
                            as_subpel_dedup_enabler[i4_index].au4_node_map,
4290
3.43M
                            0,
4291
3.43M
                            sizeof(U32) * 2 * MAP_X_MAX);
4292
3.43M
                    }
4293
9.12M
                    INSERT_NEW_NODE_NOMAP_ALTERNATE(
4294
9.12M
                        as_nodes_2nx2n, num_unique_nodes_2nx2n, ps_subpel_refine_ctxt, j, i);
4295
9.12M
                }
4296
25.6M
            }
4297
4298
            /*********************************************************************************************/
4299
            /* If sad_1 < sad_2, then satd_1 need not be lesser than satd_2. Therefore, after conversion */
4300
            /* to satd, tot_cost_1 may not be lesser than tot_cost_2. So we need to sort the search nodes*/
4301
            /* for each partition again, based on the new costs                                          */
4302
            /*********************************************************************************************/
4303
            /*********************************************************************************************/
4304
            /* Because right now, we store only the two best candidates for each partition, the sort will*/
4305
            /* converge to a simple swap.                                                                */
4306
            /* ASSUMPTION : We store only two best results per partition                                 */
4307
            /*********************************************************************************************/
4308
25.6M
            if(ps_search_results->u1_num_results_per_part == 2)
4309
0
            {
4310
0
                if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] >
4311
0
                   ps_subpel_refine_ctxt->i2_tot_cost[1][index])
4312
0
                {
4313
0
                    SWAP(
4314
0
                        ps_subpel_refine_ctxt->i2_tot_cost[0][index],
4315
0
                        ps_subpel_refine_ctxt->i2_tot_cost[1][index]);
4316
4317
0
                    SWAP(
4318
0
                        ps_subpel_refine_ctxt->i2_mv_cost[0][index],
4319
0
                        ps_subpel_refine_ctxt->i2_mv_cost[1][index]);
4320
4321
0
                    SWAP(
4322
0
                        ps_subpel_refine_ctxt->i2_mv_x[0][index],
4323
0
                        ps_subpel_refine_ctxt->i2_mv_x[1][index]);
4324
4325
0
                    SWAP(
4326
0
                        ps_subpel_refine_ctxt->i2_mv_y[0][index],
4327
0
                        ps_subpel_refine_ctxt->i2_mv_y[1][index]);
4328
4329
0
                    SWAP(
4330
0
                        ps_subpel_refine_ctxt->i2_ref_idx[0][index],
4331
0
                        ps_subpel_refine_ctxt->i2_ref_idx[1][index]);
4332
4333
0
                    SWAP(
4334
0
                        ps_subpel_refine_ctxt->ai2_fullpel_satd[0][index],
4335
0
                        ps_subpel_refine_ctxt->ai2_fullpel_satd[1][index]);
4336
0
                }
4337
0
            }
4338
25.6M
        }
4339
4340
3.43M
        if(blk_8x8_mask == 0xf)
4341
3.38M
        {
4342
3.38M
            num_unique_nodes_2nx2n =
4343
3.38M
                MIN(num_unique_nodes_2nx2n, ps_prms->u1_max_num_subpel_refine_centers);
4344
3.38M
        }
4345
3.43M
        {
4346
3.43M
            x_off = gas_part_attr_in_cu[0].u1_x_start << e_cu_size;
4347
3.43M
            y_off = gas_part_attr_in_cu[0].u1_y_start << e_cu_size;
4348
3.43M
            x_off += ps_search_results->u1_x_off;
4349
3.43M
            y_off += ps_search_results->u1_y_off;
4350
3.43M
            i4_inp_off = x_off + y_off * ps_prms->i4_inp_stride;
4351
3.43M
            e_blk_size = ge_part_id_to_blk_size[e_cu_size][0];
4352
4353
7.10M
            for(j = 0; j < num_unique_nodes_2nx2n; j++)
4354
3.67M
            {
4355
3.67M
                S32 pred_lx;
4356
3.67M
                ps_search_node = &as_nodes_2nx2n[j];
4357
4358
3.67M
                if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
4359
0
                {
4360
0
                    continue;
4361
0
                }
4362
4363
3.67M
                {
4364
3.67M
                    S08 i1_ref_idx = ps_search_node->i1_ref_idx;
4365
3.67M
                    subpel_dedup_enabler_t *ps_dedup_enabler =
4366
3.67M
                        &(as_subpel_dedup_enabler[i1_ref_idx]);
4367
4368
3.67M
                    if(ps_dedup_enabler->u1_ref_idx == MAX_NUM_REF)
4369
10.6k
                    {
4370
10.6k
                        as_subpel_dedup_enabler[i1_ref_idx].i2_mv_x = ps_search_node->s_mv.i2_mvx;
4371
10.6k
                        as_subpel_dedup_enabler[i1_ref_idx].i2_mv_y = ps_search_node->s_mv.i2_mvy;
4372
10.6k
                        as_subpel_dedup_enabler[i1_ref_idx].u1_ref_idx = i1_ref_idx;
4373
10.6k
                        memset(
4374
10.6k
                            as_subpel_dedup_enabler[i1_ref_idx].au4_node_map,
4375
10.6k
                            0,
4376
10.6k
                            sizeof(U32) * 2 * MAP_X_MAX);
4377
10.6k
                    }
4378
3.67M
                }
4379
4380
3.67M
                pred_lx = search_idx;
4381
3.67M
                ps_prms->pv_inp =
4382
3.67M
                    (void *)(ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off);
4383
4384
3.67M
                hme_subpel_refine_search_node_high_speed(
4385
3.67M
                    ps_search_node,
4386
3.67M
                    ps_prms,
4387
3.67M
                    ps_curr_layer,
4388
3.67M
                    e_blk_size,
4389
3.67M
                    x_off + ps_prms->i4_ctb_x_off,
4390
3.67M
                    y_off + ps_prms->i4_ctb_y_off,
4391
3.67M
                    ps_search_results,
4392
3.67M
                    pred_lx,
4393
3.67M
                    i4_part_mask,
4394
3.67M
                    &ps_subpel_refine_ctxt->ai4_part_id[0],
4395
3.67M
                    search_idx,
4396
3.67M
                    &(as_subpel_dedup_enabler[ps_search_node->i1_ref_idx]),
4397
3.67M
                    ps_func_selector,
4398
3.67M
                    ps_me_optimised_function_list);
4399
3.67M
            }
4400
3.43M
        }
4401
3.43M
    }
4402
0
    else
4403
0
    {
4404
0
        for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
4405
0
        {
4406
0
            S32 i4_index;
4407
4408
0
            S32 i4_part_id = ps_subpel_refine_ctxt->ai4_part_id[i];
4409
4410
0
            if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
4411
0
            {
4412
0
                i4_index = i4_part_id;
4413
0
            }
4414
0
            else
4415
0
            {
4416
0
                i4_index = i;
4417
0
            }
4418
4419
0
            for(j = 0; j < ps_search_results->u1_num_results_per_part; j++)
4420
0
            {
4421
0
                ps_subpel_refine_ctxt->i2_mv_x[j][i4_index] <<= 2;
4422
0
                ps_subpel_refine_ctxt->i2_mv_y[j][i4_index] <<= 2;
4423
0
            }
4424
0
        }
4425
0
    }
4426
4427
3.43M
    hme_subpel_refine_struct_to_search_results_struct_converter(
4428
3.43M
        ps_subpel_refine_ctxt, ps_search_results, search_idx, ps_prms->e_me_quality_presets);
4429
3.43M
}