Coverage Report

Created: 2025-12-14 06:11

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libhevc/encoder/ihevce_recur_bracketing.c
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2018 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
21
/*!
22
******************************************************************************
23
* \file ihevce_recur_bracketing.c
24
*
25
* \brief
26
*    This file contains interface functions of recursive bracketing
27
*    module
28
* \date
29
*    12/02/2012
30
*
31
* \author
32
*    Ittiam
33
*
34
* List of Functions
35
*
36
*
37
******************************************************************************
38
*/
39
40
/*****************************************************************************/
41
/* File Includes                                                             */
42
/*****************************************************************************/
43
/* System include files */
44
#include <stdio.h>
45
#include <string.h>
46
#include <stdlib.h>
47
#include <assert.h>
48
#include <stdarg.h>
49
#include <math.h>
50
51
/* User include files */
52
#include "ihevc_typedefs.h"
53
#include "itt_video_api.h"
54
#include "ihevce_api.h"
55
56
#include "rc_cntrl_param.h"
57
#include "rc_frame_info_collector.h"
58
#include "rc_look_ahead_params.h"
59
60
#include "ihevc_defs.h"
61
#include "ihevc_structs.h"
62
#include "ihevc_platform_macros.h"
63
#include "ihevc_deblk.h"
64
#include "ihevc_itrans_recon.h"
65
#include "ihevc_chroma_itrans_recon.h"
66
#include "ihevc_chroma_intra_pred.h"
67
#include "ihevc_intra_pred.h"
68
#include "ihevc_inter_pred.h"
69
#include "ihevc_mem_fns.h"
70
#include "ihevc_padding.h"
71
#include "ihevc_weighted_pred.h"
72
#include "ihevc_sao.h"
73
#include "ihevc_resi_trans.h"
74
#include "ihevc_quant_iquant_ssd.h"
75
#include "ihevc_cabac_tables.h"
76
77
#include "ihevce_defs.h"
78
#include "ihevce_lap_enc_structs.h"
79
#include "ihevce_multi_thrd_structs.h"
80
#include "ihevce_me_common_defs.h"
81
#include "ihevce_had_satd.h"
82
#include "ihevce_error_codes.h"
83
#include "ihevce_bitstream.h"
84
#include "ihevce_cabac.h"
85
#include "ihevce_rdoq_macros.h"
86
#include "ihevce_function_selector.h"
87
#include "ihevce_enc_structs.h"
88
#include "ihevce_entropy_structs.h"
89
#include "ihevce_cmn_utils_instr_set_router.h"
90
#include "ihevce_enc_loop_structs.h"
91
#include "ihevce_ipe_instr_set_router.h"
92
#include "ihevce_ipe_structs.h"
93
#include "ihevce_ipe_pass.h"
94
#include "ihevce_recur_bracketing.h"
95
#include "ihevce_nbr_avail.h"
96
#include "ihevc_common_tables.h"
97
#include "ihevce_decomp_pre_intra_structs.h"
98
#include "ihevce_decomp_pre_intra_pass.h"
99
100
#include "cast_types.h"
101
#include "osal.h"
102
#include "osal_defaults.h"
103
104
/*****************************************************************************/
105
/* Constant Macros                                                           */
106
/*****************************************************************************/
107
#define IP_DBG_L1_l2 0
108
1.49M
#define CHILD_BIAS 12
109
110
/*****************************************************************************/
111
/* Globals                                                                   */
112
/*****************************************************************************/
113
extern pf_intra_pred g_apf_lum_ip[10];
114
115
extern WORD32 g_i4_ip_funcs[MAX_NUM_IP_MODES];
116
117
/* Position lookup tables for the 64 8x8 blocks of a CTB, indexed by the     */
/* running 8x8 block count. Entry k holds the column (x table) or row        */
/* (y table) of block k, in 8x8 units. The order is a recursive quad order:  */
/* every run of 4 entries covers one 2x2 group (a 16x16 area) and every run  */
/* of 16 entries covers one 4x4 group (a 32x32 area).                        */
UWORD8 gau1_cu_pos_x[64] = {
    0, 1, 0, 1, 2, 3, 2, 3,
    0, 1, 0, 1, 2, 3, 2, 3,
    4, 5, 4, 5, 6, 7, 6, 7,
    4, 5, 4, 5, 6, 7, 6, 7,
    0, 1, 0, 1, 2, 3, 2, 3,
    0, 1, 0, 1, 2, 3, 2, 3,
    4, 5, 4, 5, 6, 7, 6, 7,
    4, 5, 4, 5, 6, 7, 6, 7
};

UWORD8 gau1_cu_pos_y[64] = {
    0, 0, 1, 1, 0, 0, 1, 1,
    2, 2, 3, 3, 2, 2, 3, 3,
    0, 0, 1, 1, 0, 0, 1, 1,
    2, 2, 3, 3, 2, 2, 3, 3,
    4, 4, 5, 5, 4, 4, 5, 5,
    6, 6, 7, 7, 6, 6, 7, 7,
    4, 4, 5, 5, 4, 4, 5, 5,
    6, 6, 7, 7, 6, 6, 7, 7
};
124
125
/* Clears bit number `bit` in the lvalue `x`. Both arguments are fully       */
/* parenthesized so that expression arguments (e.g. `k & 3`) expand with the */
/* intended precedence. Note that `x` is still evaluated twice.              */
#define RESET_BIT(x, bit) ((x) = (x) & ~((WORD32)1 << (bit)))
126
127
/*****************************************************************************/
128
/* Function Definitions                                                      */
129
/*****************************************************************************/
130
131
/*!
132
******************************************************************************
133
* \if Function name : ihevce_update_cand_list \endif
134
*
135
* \brief
136
*    Final Candidate list population, nbr flag andd nbr mode update function
137
*
138
* \param[in] ps_row_cu : pointer to cu analyse struct
139
* \param[in] ps_cu_node : pointer to cu node info buffer
140
* \param[in] ps_ed_blk_l1 : pointer to level 1 and 2 decision buffer
141
* \param[in] pu1_cand_mode_list  : pointer to candidate list buffer
142
*
143
* \return
144
*    None
145
*
146
* \author
147
*  Ittiam
148
*
149
*****************************************************************************
150
*/
151
void ihevce_update_cand_list(
152
    ihevce_ipe_cu_tree_t *ps_cu_node, ihevce_ed_blk_t *ps_ed_blk_l1, ihevce_ipe_ctxt_t *ps_ctxt)
153
1.77M
{
154
1.77M
    WORD32 row, col, x, y, size;
155
156
    /* Candidate mode Update */
157
1.77M
    (void)ps_ed_blk_l1;
158
    /* Update CTB mode map for the finalised CU */
159
1.77M
    x = ((ps_cu_node->u2_x0 << 3) >> 2) + 1;
160
1.77M
    y = ((ps_cu_node->u2_y0 << 3) >> 2) + 1;
161
1.77M
    size = ps_cu_node->u1_cu_size >> 2;
162
9.90M
    for(row = y; row < (y + size); row++)
163
8.13M
    {
164
55.5M
        for(col = x; col < (x + size); col++)
165
47.4M
        {
166
47.4M
            ps_ctxt->au1_ctb_mode_map[row][col] = ps_cu_node->best_mode;
167
47.4M
        }
168
8.13M
    }
169
1.77M
    return;
170
1.77M
}
171
172
/*!
173
******************************************************************************
174
* \if Function name : ihevce_intra_populate_mode_bits_cost_bracketing \endif
175
*
176
* \brief
177
*    Mpm indx calc function based on left and top available modes
178
*
179
* \param[in] top_intra_mode : Top available intra mode
180
* \param[in] left_intra_mode : Left available intra mode
181
* \param[in] available_top : Top availability flag
182
* \param[in] available_left : Left availability flag
183
* \param[in] cu_pos_y : cu position wrt to CTB
184
* \param[in] mode_bits_cost : pointer to mode bits buffer
185
* \param[in] lambda : Lambda value (SAD/SATD)
186
* \param[in] cand_mode_list  : pointer to candidate list buffer
187
*
188
* \return
189
*    None
190
*
191
* \author
192
*  Ittiam
193
*
194
*****************************************************************************
195
*/
196
void ihevce_intra_populate_mode_bits_cost_bracketing(
197
    WORD32 top_intra_mode,
198
    WORD32 left_intra_mode,
199
    WORD32 available_top,
200
    WORD32 available_left,
201
    WORD32 cu_pos_y,
202
    UWORD16 *mode_bits_cost,
203
    UWORD16 *mode_bits,
204
    WORD32 lambda,
205
    WORD32 *cand_mode_list)
206
9.36M
{
207
    /* local variables */
208
9.36M
    WORD32 i;
209
9.36M
    WORD32 cand_intra_pred_mode_left, cand_intra_pred_mode_top;
210
211
9.36M
    UWORD16 one_bits_cost =
212
9.36M
        COMPUTE_RATE_COST_CLIP30(4, lambda, (LAMBDA_Q_SHIFT + 1));  //1.5 * lambda
213
9.36M
    UWORD16 two_bits_cost =
214
9.36M
        COMPUTE_RATE_COST_CLIP30(6, lambda, (LAMBDA_Q_SHIFT + 1));  //2.5 * lambda
215
9.36M
    UWORD16 five_bits_cost =
216
9.36M
        COMPUTE_RATE_COST_CLIP30(12, lambda, (LAMBDA_Q_SHIFT + 1));  //5.5 * lambda
217
218
337M
    for(i = 0; i < 35; i++)
219
327M
    {
220
327M
        mode_bits_cost[i] = five_bits_cost;
221
327M
        mode_bits[i] = 5;
222
327M
    }
223
224
    /* EIID: set availability flag to zero if modes are invalid.
225
       Required since some CU's might be skipped (though available)
226
       and their modes will be set to 255 (-1)*/
227
9.36M
    if(35 < top_intra_mode || 0 > top_intra_mode)
228
0
        available_top = 0;
229
9.36M
    if(35 < left_intra_mode || 0 > left_intra_mode)
230
0
        available_left = 0;
231
232
    /* Calculate cand_intra_pred_mode_N as per sec. 8.4.2 in JCTVC-J1003_d7 */
233
    /* N = top */
234
9.36M
    if(0 == available_top)
235
1.26M
    {
236
1.26M
        cand_intra_pred_mode_top = INTRA_DC;
237
1.26M
    }
238
    /* for neighbour != INTRA, setting DC is done outside */
239
8.10M
    else if(0 == cu_pos_y) /* It's on the CTB boundary */
240
1.09M
    {
241
1.09M
        cand_intra_pred_mode_top = INTRA_DC;
242
1.09M
    }
243
7.00M
    else
244
7.00M
    {
245
7.00M
        cand_intra_pred_mode_top = top_intra_mode;
246
7.00M
    }
247
248
    /* N = left */
249
9.36M
    if(0 == available_left)
250
1.11M
    {
251
1.11M
        cand_intra_pred_mode_left = INTRA_DC;
252
        //cand_intra_pred_mode_left = cand_intra_pred_mode_top;
253
1.11M
    }
254
    /* for neighbour != INTRA, setting DC is done outside */
255
8.24M
    else
256
8.24M
    {
257
8.24M
        cand_intra_pred_mode_left = left_intra_mode;
258
8.24M
    }
259
260
    /* Calculate cand_mode_list as per sec. 8.4.2 in JCTVC-J1003_d7 */
261
9.36M
    if(cand_intra_pred_mode_left == cand_intra_pred_mode_top)
262
4.66M
    {
263
4.66M
        if(cand_intra_pred_mode_left < 2)
264
3.66M
        {
265
3.66M
            cand_mode_list[0] = INTRA_PLANAR;
266
3.66M
            cand_mode_list[1] = INTRA_DC;
267
3.66M
            cand_mode_list[2] = INTRA_ANGULAR(26); /* angular 26 = Vertical */
268
3.66M
        }
269
998k
        else
270
998k
        {
271
998k
            cand_mode_list[0] = cand_intra_pred_mode_left;
272
998k
            cand_mode_list[1] = 2 + ((cand_intra_pred_mode_left + 29) % 32);
273
998k
            cand_mode_list[2] = 2 + ((cand_intra_pred_mode_left - 2 + 1) % 32);
274
998k
        }
275
4.66M
    }
276
4.70M
    else
277
4.70M
    {
278
4.70M
        if(0 == available_left)
279
786k
        {
280
786k
            cand_mode_list[0] = cand_intra_pred_mode_top;
281
786k
            cand_mode_list[1] = cand_intra_pred_mode_left;
282
786k
        }
283
3.91M
        else
284
3.91M
        {
285
3.91M
            cand_mode_list[0] = cand_intra_pred_mode_left;
286
3.91M
            cand_mode_list[1] = cand_intra_pred_mode_top;
287
3.91M
        }
288
4.70M
        if((cand_intra_pred_mode_left != INTRA_PLANAR) &&
289
3.05M
           (cand_intra_pred_mode_top != INTRA_PLANAR))
290
2.16M
        {
291
2.16M
            cand_mode_list[2] = INTRA_PLANAR;
292
2.16M
        }
293
2.53M
        else if((cand_intra_pred_mode_left != INTRA_DC) && (cand_intra_pred_mode_top != INTRA_DC))
294
570k
        {
295
570k
            cand_mode_list[2] = INTRA_DC;
296
570k
        }
297
1.96M
        else
298
1.96M
        {
299
1.96M
            cand_mode_list[2] = INTRA_ANGULAR(26);
300
1.96M
        }
301
4.70M
    }
302
9.36M
    mode_bits_cost[cand_mode_list[0]] = one_bits_cost;
303
9.36M
    mode_bits_cost[cand_mode_list[1]] = two_bits_cost;
304
9.36M
    mode_bits_cost[cand_mode_list[2]] = two_bits_cost;
305
306
9.36M
    mode_bits[cand_mode_list[0]] = 2;
307
9.36M
    mode_bits[cand_mode_list[1]] = 3;
308
9.36M
    mode_bits[cand_mode_list[2]] = 3;
309
9.36M
}
310
311
/*!
312
******************************************************************************
313
* \if Function name : ihevce_pu_calc_4x4_blk \endif
314
*
315
* \brief
316
*    4x4 pu (8x8 CU) mode decision using step 8421 method
317
*
318
* \param[in] ps_cu_node : pointer to cu node info buffer
319
* \param[in] pu1_src : pointer to src pixels
320
* \param[in] src_stride : frm source stride
321
* \param[in] ref : pointer to reference pixels for prediction
322
* \param[in] cand_mode_list  : pointer to candidate list buffer
323
* \param[in] best_costs_4x4  : pointer to 3 best cost buffer
324
* \param[in] best_modes_4x4  : pointer to 3 best mode buffer
325
*
326
* \return
327
*    None
328
*
329
* \author
330
*  Ittiam
331
*
332
*****************************************************************************
333
*/
334
void ihevce_pu_calc_4x4_blk(
335
    ihevce_ipe_ctxt_t *ps_ctxt,
336
    ihevce_ipe_cu_tree_t *ps_cu_node,
337
    UWORD8 *pu1_src,
338
    WORD32 src_stride,
339
    UWORD8 *ref,
340
    UWORD16 *mode_bits_cost,
341
    WORD32 *best_costs_4x4,
342
    UWORD8 *best_modes_4x4,
343
    func_selector_t *ps_func_selector)
344
2.31M
{
345
2.31M
    WORD16 *pi2_trans_tmp = ps_ctxt->pi2_trans_tmp;
346
2.31M
    WORD16 *pi2_trans_out = ps_ctxt->pi2_trans_out;
347
2.31M
    UWORD8 u1_use_satd = ps_ctxt->u1_use_satd;
348
2.31M
    UWORD8 u1_level_1_refine_on = ps_ctxt->u1_level_1_refine_on;
349
350
2.31M
    WORD32 i, j = 0, i_end;
351
2.31M
    UWORD8 mode, best_amode = 255;
352
2.31M
    UWORD8 pred[16];
353
354
2.31M
    UWORD16 sad;
355
2.31M
    WORD32 sad_cost = 0;
356
2.31M
    WORD32 best_asad_cost = 0xFFFFF;
357
2.31M
    WORD32 temp;
358
2.31M
    UWORD8 modes_to_eval[5];
359
2.31M
    WORD32 costs_4x4[5];
360
2.31M
    UWORD8 modes_4x4[5] = { 0, 1, 2, 3, 4 };
361
362
    /* LO resolution hence low resolution disable */
363
2.31M
    WORD32 u1_low_resol = 0;
364
2.31M
    UWORD8 au1_best_modes[1] = { 0 };
365
2.31M
    WORD32 ai4_best_sad_costs[1] = { 0 };
366
367
2.31M
    WORD16 *pi2_tmp = &pi2_trans_tmp[0];
368
369
2.31M
    ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list =
370
2.31M
        &ps_ctxt->s_ipe_optimised_function_list;
371
372
    //apf_resd_trns[0] = &ihevc_resi_trans_4x4_ttype1;
373
    //apf_resd_trns[0] = &ihevc_HAD_4x4_8bit;
374
375
13.8M
    for(i = 0; i < 5; i++)
376
11.5M
    {
377
11.5M
        costs_4x4[i] = MAX_INTRA_COST_IPE;
378
11.5M
    }
379
380
2.31M
    ps_ipe_optimised_function_list->pf_ed_4x4_find_best_modes(
381
2.31M
        pu1_src,
382
2.31M
        src_stride,
383
2.31M
        ref,
384
2.31M
        mode_bits_cost,
385
2.31M
        au1_best_modes,
386
2.31M
        ai4_best_sad_costs,
387
2.31M
        u1_low_resol,
388
2.31M
        ps_ipe_optimised_function_list->pf_4x4_sad_computer);
389
390
2.31M
    best_amode = au1_best_modes[0];
391
2.31M
    best_asad_cost = ai4_best_sad_costs[0];
392
393
2.31M
    ASSERT(best_amode != 255);
394
    /* Around best level 4 angular mode, search for best level 2 mode */
395
2.31M
    modes_to_eval[0] = best_amode - 2;
396
2.31M
    modes_to_eval[1] = best_amode + 2;
397
2.31M
    i = 0;
398
2.31M
    i_end = 2;
399
2.31M
    if(best_amode == 2)
400
170k
        i = 1;
401
2.14M
    else if(best_amode == 34)
402
78.1k
        i_end = 1;
403
6.69M
    for(; i < i_end; i++)
404
4.38M
    {
405
4.38M
        mode = modes_to_eval[i];
406
407
4.38M
        g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
408
409
4.38M
        sad = ps_ipe_optimised_function_list->pf_4x4_sad_computer(pu1_src, &pred[0], src_stride, 4);
410
411
4.38M
        sad_cost = sad;
412
4.38M
        sad_cost += mode_bits_cost[mode];
413
414
4.38M
        if(sad_cost < best_asad_cost)
415
297k
        {
416
297k
            best_amode = mode;
417
297k
            best_asad_cost = sad_cost;
418
297k
        }
419
4.38M
    }
420
421
    /* Around best level 2 angular mode, search for best level 1 mode */
422
    /* Also evaluate for non-angular mode */
423
424
2.31M
    i = 0;
425
    /*Level 1 refinement is disabled for ES preset */
426
2.31M
    if(1 == u1_level_1_refine_on)
427
2.31M
    {
428
2.31M
        if(best_amode != 2)
429
2.15M
            modes_to_eval[i++] = best_amode - 1;
430
2.31M
        modes_to_eval[i++] = best_amode;
431
2.31M
    }
432
433
2.31M
    modes_to_eval[i++] = 0;
434
2.31M
    modes_to_eval[i++] = 1;
435
436
2.31M
    if(1 == u1_level_1_refine_on)
437
2.31M
    {
438
2.31M
        if(best_amode != 34)
439
2.24M
            modes_to_eval[i++] = best_amode + 1;
440
2.31M
    }
441
2.31M
    i_end = i;
442
2.31M
    i = 0;
443
444
13.6M
    for(; i < i_end; i++)
445
11.3M
    {
446
11.3M
        mode = modes_to_eval[i];
447
448
11.3M
        g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
449
450
        /* Hard coding to use SATD */
451
11.3M
        if(u1_use_satd)
452
8.17M
        {
453
8.17M
            ps_func_selector->ihevc_resi_trans_4x4_ttype1_fptr(
454
8.17M
                pu1_src, &pred[0], (WORD32 *)pi2_tmp, pi2_trans_out, src_stride, 4, 4, NULL_PLANE);
455
456
8.17M
            sad = ihevce_ipe_pass_satd(pi2_trans_out, 4, 4);
457
8.17M
        }
458
3.16M
        else
459
3.16M
        {
460
3.16M
            sad = ps_ipe_optimised_function_list->pf_4x4_sad_computer(
461
3.16M
                pu1_src, &pred[0], src_stride, 4);
462
3.16M
        }
463
11.3M
        sad_cost = sad;
464
11.3M
        sad_cost += mode_bits_cost[mode];
465
466
11.3M
        costs_4x4[i] = sad_cost;
467
11.3M
    }
468
469
    /* Arrange the reference array in ascending order */
470
11.3M
    for(i = 0; i < (i_end - 1); i++)
471
9.03M
    {
472
31.2M
        for(j = i + 1; j < i_end; j++)
473
22.2M
        {
474
22.2M
            if(costs_4x4[i] > costs_4x4[j])
475
7.21M
            {
476
7.21M
                temp = costs_4x4[i];
477
7.21M
                costs_4x4[i] = costs_4x4[j];
478
7.21M
                costs_4x4[j] = temp;
479
480
7.21M
                temp = modes_4x4[i];
481
7.21M
                modes_4x4[i] = modes_4x4[j];
482
7.21M
                modes_4x4[j] = temp;
483
7.21M
            }
484
22.2M
        }
485
9.03M
    }
486
9.26M
    for(i = 0; i < 3; i++)
487
6.94M
    {
488
6.94M
        best_costs_4x4[i] = costs_4x4[i];
489
6.94M
        best_modes_4x4[i] = modes_to_eval[modes_4x4[i]];
490
6.94M
    }
491
492
2.31M
    {
493
2.31M
        ps_cu_node->best_mode = best_modes_4x4[0];
494
2.31M
        ps_cu_node->best_cost = best_costs_4x4[0];
495
2.31M
        ps_cu_node->best_satd = best_costs_4x4[0] - mode_bits_cost[ps_cu_node->best_mode];
496
2.31M
    }
497
2.31M
}
498
499
/*!
500
******************************************************************************
501
* \if Function name : ihevce_pu_calc_8x8_blk \endif
502
*
503
* \brief
504
*    4x4 pu (8x8 CU) mode decision loop using step 8421 method
505
*
506
* \param[in] ps_curr_src : pointer to src pixels struct
507
* \param[in] ps_ctxt : pointer to IPE context struct
508
* \param[in] ps_cu_node : pointer to cu node info buffer
509
*
510
* \return
511
*    None
512
*
513
* \author
514
*  Ittiam
515
*
516
*****************************************************************************
517
*/
518
void ihevce_pu_calc_8x8_blk(
519
    iv_enc_yuv_buf_t *ps_curr_src,
520
    ihevce_ipe_ctxt_t *ps_ctxt,
521
    ihevce_ipe_cu_tree_t *ps_cu_node,
522
    func_selector_t *ps_func_selector)
523
578k
{
524
578k
    WORD32 i, j;
525
578k
    WORD32 nbr_flags;
526
578k
    nbr_avail_flags_t s_nbr;
527
578k
    WORD32 trans_size = ps_cu_node->ps_parent->u1_cu_size >> 1;
528
529
578k
    UWORD8 *pu1_src_4x4;
530
578k
    WORD32 xA, xB, yA, yB;
531
    //WORD32 x, y, size;
532
578k
    WORD32 top_intra_mode;
533
578k
    WORD32 left_intra_mode;
534
    //    WORD8 *top_intra_mode_ptr;
535
    //  WORD8 *left_intra_mode_ptr;
536
578k
    UWORD8 *pu1_orig;
537
578k
    WORD32 src_strd = ps_curr_src->i4_y_strd;
538
539
578k
    WORD32 cu_pos_x = ps_cu_node->ps_parent->u2_x0 << 1;
540
578k
    WORD32 cu_pos_y = ps_cu_node->ps_parent->u2_y0 << 1;
541
578k
    ihevc_intra_pred_luma_ref_substitution_ft *ihevc_intra_pred_luma_ref_substitution_fptr;
542
543
578k
    ihevc_intra_pred_luma_ref_substitution_fptr =
544
578k
        ps_ctxt->ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;
545
546
578k
    pu1_orig = (UWORD8 *)(ps_curr_src->pv_y_buf) +
547
578k
               ((ps_cu_node->ps_parent->u2_y0 << 3) * src_strd) +
548
578k
               (ps_cu_node->ps_parent->u2_x0 << 3);
549
1.73M
    for(i = 0; i < 2; i++)
550
1.15M
    {
551
3.47M
        for(j = 0; j < 2; j++)
552
2.31M
        {
553
2.31M
            WORD32 cand_mode_list[3];
554
2.31M
            pu1_src_4x4 = pu1_orig + (i * trans_size * src_strd) + (j * trans_size);
555
            /* get the neighbour availability flags */
556
2.31M
            nbr_flags = ihevce_get_nbr_intra(
557
2.31M
                &s_nbr,
558
2.31M
                ps_ctxt->pu1_ctb_nbr_map,
559
2.31M
                ps_ctxt->i4_nbr_map_strd,
560
2.31M
                cu_pos_x + ((j) * (trans_size >> 2)),
561
2.31M
                cu_pos_y + ((i) * (trans_size >> 2)),
562
2.31M
                trans_size >> 2);
563
564
            /* call the function which populates sad cost for all the modes */
565
2.31M
            xA = ((ps_cu_node->ps_parent->u2_x0 << 3) >> 2) + j;
566
2.31M
            yA = ((ps_cu_node->ps_parent->u2_y0 << 3) >> 2) + 1 + i;
567
2.31M
            xB = xA + 1;
568
2.31M
            yB = yA - 1;
569
2.31M
            left_intra_mode = ps_ctxt->au1_ctb_mode_map[yA][xA];
570
2.31M
            top_intra_mode = ps_ctxt->au1_ctb_mode_map[yB][xB];
571
572
2.31M
            ihevce_intra_populate_mode_bits_cost_bracketing(
573
2.31M
                top_intra_mode,
574
2.31M
                left_intra_mode,
575
2.31M
                s_nbr.u1_top_avail,
576
2.31M
                s_nbr.u1_left_avail,
577
2.31M
                ps_cu_node->ps_parent->u2_y0,
578
2.31M
                &ps_ctxt->au2_mode_bits_cost_8x8pu[i * 2 + j][0],
579
2.31M
                &ps_ctxt->au2_mode_bits_8x8_pu[0],
580
2.31M
                ps_ctxt->i4_ol_sad_lambda,
581
2.31M
                cand_mode_list);
582
583
            /* call the function which populates ref data for intra predicion */
584
2.31M
            ihevc_intra_pred_luma_ref_substitution_fptr(
585
2.31M
                pu1_src_4x4 - src_strd - 1,
586
2.31M
                pu1_src_4x4 - src_strd,
587
2.31M
                pu1_src_4x4 - 1,
588
2.31M
                src_strd,
589
2.31M
                4,
590
2.31M
                nbr_flags,
591
2.31M
                &ps_ctxt->au1_ref_8x8pu[i * 2 + j][0],
592
2.31M
                0);
593
594
2.31M
            ihevce_pu_calc_4x4_blk(
595
2.31M
                ps_ctxt,
596
2.31M
                ps_cu_node->ps_sub_cu[(i * 2) + j],
597
2.31M
                pu1_src_4x4,
598
2.31M
                src_strd,
599
2.31M
                &ps_ctxt->au1_ref_8x8pu[i * 2 + j][0],
600
2.31M
                &ps_ctxt->au2_mode_bits_cost_8x8pu[i * 2 + j][0],
601
2.31M
                &ps_cu_node->ps_sub_cu[(i * 2) + j]->au4_best_cost_1tu[0],
602
2.31M
                &ps_cu_node->ps_sub_cu[(i * 2) + j]->au1_best_mode_1tu[0],
603
2.31M
                ps_func_selector);
604
605
            /*&au4_cost_4x4[i*2 + j][0],
606
                &au1_modes_4x4[i*2 + j][0]);*/ //TTODO : mode will change for the four partition
607
608
2.31M
            ihevce_set_nbr_map(
609
2.31M
                ps_ctxt->pu1_ctb_nbr_map,
610
2.31M
                ps_ctxt->i4_nbr_map_strd,
611
2.31M
                cu_pos_x + ((j) * (trans_size >> 2)),
612
2.31M
                cu_pos_y + ((i) * (trans_size >> 2)),
613
2.31M
                (trans_size >> 2),
614
2.31M
                1);
615
616
2.31M
            xA = ((ps_cu_node->ps_parent->u2_x0 << 3) >> 2) + 1 + j;
617
2.31M
            yA = ((ps_cu_node->ps_parent->u2_y0 << 3) >> 2) + 1 + i;
618
2.31M
            ps_ctxt->au1_ctb_mode_map[yA][xA] = ps_cu_node->ps_sub_cu[i * 2 + j]->best_mode;
619
2.31M
            ps_cu_node->ps_sub_cu[i * 2 + j]->u2_mode_bits_cost =
620
2.31M
                ps_ctxt->au2_mode_bits_8x8_pu[ps_cu_node->ps_sub_cu[i * 2 + j]->best_mode];
621
2.31M
        }
622
1.15M
    }
623
578k
}
624
625
/*!
626
******************************************************************************
627
* \if Function name : ihevce_bracketing_analysis \endif
628
*
629
* \brief
630
*    Interface function that evaluates MAX cu and MAX - 1 cu, with MAX cu size
631
*    info decided coarse resolution mode decision. Compares the SATD/SAD cost btwn
632
*    2 CUS and determines the actual CU size and best 3 modes to be given to rdopt
633
*
634
* \param[in] ps_ctxt : pointer to IPE context struct
635
* \param[in] ps_cu_node : pointer to cu node info buffer
636
* \param[in] ps_curr_src : pointer to src pixels struct
637
* \param[in] ps_ctb_out : pointer to ip ctb out struct
638
* \param[in] ps_row_cu : pointer to cu analyse struct
639
* \param[in] ps_ed_l1_ctb : pointer to level 1 early deci struct
640
* \param[in] ps_ed_l2_ctb : pointer to level 2 early deci struct
641
* \param[in] ps_l0_ipe_out_ctb : pointer to ipe_l0_ctb_analyse_for_me_t struct
642
*
643
* \return
644
*    None
645
*
646
* \author
647
*  Ittiam
648
*
649
*****************************************************************************
650
*/
651
void ihevce_bracketing_analysis(
652
    ihevce_ipe_ctxt_t *ps_ctxt,
653
    ihevce_ipe_cu_tree_t *ps_cu_node,
654
    iv_enc_yuv_buf_t *ps_curr_src,
655
    ctb_analyse_t *ps_ctb_out,
656
    //cu_analyse_t         *ps_row_cu,
657
    ihevce_ed_blk_t *ps_ed_l1_ctb,
658
    ihevce_ed_blk_t *ps_ed_l2_ctb,
659
    ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
660
    ipe_l0_ctb_analyse_for_me_t *ps_l0_ipe_out_ctb)
661
203k
{
662
203k
    WORD32 cu_pos_x = 0;
663
203k
    WORD32 cu_pos_y = 0;
664
665
203k
    UWORD8 u1_curr_ctb_wdt = ps_cu_node->u1_width;
666
203k
    UWORD8 u1_curr_ctb_hgt = ps_cu_node->u1_height;
667
203k
    WORD32 num_8x8_blks_x = (u1_curr_ctb_wdt >> 3);
668
203k
    WORD32 num_8x8_blks_y = (u1_curr_ctb_hgt >> 3);
669
670
203k
    ihevce_ed_blk_t *ps_ed_blk_l1 = ps_ed_l1_ctb;
671
203k
    ihevce_ed_blk_t *ps_ed_blk_l2 = ps_ed_l2_ctb;
672
673
203k
    WORD32 i;
674
203k
    WORD32 cand_mode_list[3];
675
    //cu_analyse_t *ps_curr_cu = ps_row_cu;
676
203k
    WORD32 blk_cnt = 0;
677
203k
    WORD32 j = 0;
678
203k
    WORD32 merge_32x32_l1, merge_32x32_l2;
679
680
203k
    WORD32 i4_skip_intra_eval_32x32_l1;
681
    //EIID: flag indicating number of 16x16 blocks to be skipped for intra evaluation within 32x32 block
682
683
203k
    WORD32 parent_cost = 0;
684
203k
    WORD32 child_cost[4] = { 0 };
685
203k
    WORD32 child_cost_least = 0;
686
203k
    WORD32 child_satd[4] = { 0 };
687
203k
    WORD32 x, y, size;
688
203k
    WORD32 merge_64x64 = 1;
689
203k
    UWORD8 au1_best_32x32_modes[4];
690
203k
    WORD32 au4_best_32x32_cost[4];
691
203k
    WORD32 parent_best_mode;
692
203k
    UWORD8 best_mode;
693
694
203k
    WORD32 i4_quality_preset = ps_ctxt->i4_quality_preset;
695
    /* flag to control 1CU-4TU modes based on quality preset                */
696
    /* if set 1CU-4TU are explicitly evaluated else 1CU-1TU modes are copied */
697
203k
    WORD32 i4_enable_1cu_4tu = (i4_quality_preset == IHEVCE_QUALITY_P2) ||
698
176k
                               (i4_quality_preset == IHEVCE_QUALITY_P0);
699
700
    /* flag to control 4CU-16TU mode based on quality preset                */
701
    /* if set 4CU-16TU are explicitly evaluated else 4CU-4TU modes are copied */
702
203k
    WORD32 i4_enable_4cu_16tu = (i4_quality_preset == IHEVCE_QUALITY_P2) ||
703
176k
                                (i4_quality_preset == IHEVCE_QUALITY_P0);
704
705
203k
    WORD32 i4_mod_factor_num, i4_mod_factor_den = QP_MOD_FACTOR_DEN;  //2;
706
203k
    float f_strength;
707
    /* Accumulate SATD */
708
203k
    LWORD64 i8_frame_acc_satd_cost = 0, i8_frame_acc_satd_by_modqp_q10 = 0;
709
203k
    WORD32 i4_ctb_acc_satd = 0;
710
711
    /* Accumulate mode bits cost */
712
203k
    LWORD64 i8_frame_acc_mode_bits_cost = 0;
713
714
    /* Step2 is bypassed for parent, uses children modes*/
715
203k
    WORD32 step2_bypass = 1;
716
717
203k
    if(1 == ps_ctxt->u1_disable_child_cu_decide)
718
0
        step2_bypass = 0;
719
720
203k
    ps_cu_node->ps_parent = ps_ctxt->ps_ipe_cu_tree;
721
1.01M
    for(i = 0; i < 4; i++)
722
813k
    {
723
813k
        ps_cu_node->ps_sub_cu[i] = ps_ctxt->ps_ipe_cu_tree + 1 + i;
724
813k
    }
725
726
    /* Loop for all 8x8 block in a CTB */
727
203k
    ps_ctb_out->u4_cu_split_flags = 0x1;
728
729
    /* Initialize intra 64x64, 32x32 and 16x16 costs to max value */
730
1.01M
    for(i = 0; i < (MAX_CU_IN_CTB >> 4); i++)
731
813k
    {
732
813k
        ps_l0_ipe_out_ctb->ai4_best32x32_intra_cost[i] = MAX_INTRA_COST_IPE;
733
813k
    }
734
735
3.45M
    for(i = 0; i < (MAX_CU_IN_CTB >> 2); i++)
736
3.25M
    {
737
3.25M
        ps_l0_ipe_out_ctb->ai4_best16x16_intra_cost[i] = MAX_INTRA_COST_IPE;
738
3.25M
    }
739
740
13.2M
    for(i = 0; i < (MAX_CU_IN_CTB); i++)
741
13.0M
    {
742
13.0M
        ps_l0_ipe_out_ctb->ai4_best8x8_intra_cost[i] = MAX_INTRA_COST_IPE;
743
13.0M
    }
744
745
203k
    ps_l0_ipe_out_ctb->i4_best64x64_intra_cost = MAX_INTRA_COST_IPE;
746
747
    /* by default 64x64 modes are set to default values DC and Planar */
748
203k
    ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[0] = 0;
749
203k
    ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[1] = 1;
750
203k
    ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[2] = 255;
751
752
    /* by default 64x64 split is set to 1 */
753
203k
    ps_l0_ipe_out_ctb->u1_split_flag = 1;
754
755
    /* Modulation factor calculated based on spatial variance instead of hardcoded val*/
756
203k
    i4_mod_factor_num = ps_ctxt->ai4_mod_factor_derived_by_variance[1];  //16;
757
758
203k
    f_strength = ps_ctxt->f_strength;
759
760
    /* ------------------------------------------------ */
761
    /* populate the early decisions done by L1 analysis */
762
    /* ------------------------------------------------ */
763
3.45M
    for(i = 0; i < (MAX_CU_IN_CTB >> 2); i++)
764
3.25M
    {
765
3.25M
        ps_l0_ipe_out_ctb->ai4_best_sad_8x8_l1_ipe[i] = ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[i];
766
3.25M
        ps_l0_ipe_out_ctb->ai4_best_sad_cost_8x8_l1_ipe[i] = ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[i];
767
3.25M
        ps_l0_ipe_out_ctb->ai4_best_sad_8x8_l1_me[i] = ps_ed_ctb_l1->i4_best_sad_8x8_l1_me[i];
768
3.25M
        ps_l0_ipe_out_ctb->ai4_best_sad_cost_8x8_l1_me[i] = ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_me[i];
769
3.25M
    }
770
771
    /* Init CTB level accumulated SATD and MPM bits */
772
203k
    ps_l0_ipe_out_ctb->i4_ctb_acc_satd = 0;
773
203k
    ps_l0_ipe_out_ctb->i4_ctb_acc_mpm_bits = 0;
774
775
    /* ------------------------------------------------ */
776
    /* Loop over all the blocks in current CTB          */
777
    /* ------------------------------------------------ */
778
203k
    {
779
        /* 64 8x8 blocks should be encountered for the do,while loop to exit */
780
203k
        do
781
2.05M
        {
782
2.05M
            intra32_analyse_t *ps_intra32_analyse;
783
2.05M
            intra16_analyse_t *ps_intra16_analyse;
784
2.05M
            WORD32 *pi4_intra_32_cost;
785
2.05M
            WORD32 *pi4_intra_16_cost;
786
2.05M
            WORD32 *pi4_intra_8_cost;
787
2.05M
            WORD32 merge_16x16_l1;
788
789
            /* Given the blk_cnt, get the CU's top-left 8x8 block's x and y positions within the CTB */
790
2.05M
            cu_pos_x = gau1_cu_pos_x[blk_cnt];
791
2.05M
            cu_pos_y = gau1_cu_pos_y[blk_cnt];
792
793
            /* default value for 32x32 best mode - blk_cnt increases by 16 for each 32x32 */
794
2.05M
            au1_best_32x32_modes[blk_cnt >> 4] = 255;
795
796
            /* get the corresponding intra 32 analyse pointer  use (blk_cnt / 16) */
797
            /* blk cnt is in terms of 8x8 units so a 32x32 will have 16 8x8 units */
798
2.05M
            ps_intra32_analyse = &ps_l0_ipe_out_ctb->as_intra32_analyse[blk_cnt >> 4];
799
800
            /* get the corresponding intra 16 analyse pointer use (blk_cnt & 0xF / 4)*/
801
            /* blk cnt is in terms of 8x8 units so a 16x16 will have 4 8x8 units */
802
2.05M
            ps_intra16_analyse = &ps_intra32_analyse->as_intra16_analyse[(blk_cnt & 0xF) >> 2];
803
804
            /* Line below assumes min_cu_size of 8 - checks whether CU starts are within picture */
805
2.05M
            if((cu_pos_x < num_8x8_blks_x) && (cu_pos_y < num_8x8_blks_y))
806
1.30M
            {
807
                /* Reset to zero for every cu decision */
808
1.30M
                merge_32x32_l1 = 0;
809
810
1.30M
                child_cost_least = 0;
811
812
                /* At L2, each 4x4 corresponds to 16x16 at L0. Every 4 16x16 stores a merge_success flag */
813
1.30M
                ps_ed_blk_l2 = ps_ed_l2_ctb + (blk_cnt >> 2);
814
815
1.30M
                pi4_intra_32_cost = &ps_l0_ipe_out_ctb->ai4_best32x32_intra_cost[blk_cnt >> 4];
816
817
                /* by default 32x32 modes are set to default values DC and Planar */
818
1.30M
                ps_intra32_analyse->au1_best_modes_32x32_tu[0] = 0;
819
1.30M
                ps_intra32_analyse->au1_best_modes_32x32_tu[1] = 1;
820
1.30M
                ps_intra32_analyse->au1_best_modes_32x32_tu[2] = 255;
821
822
                /* By default 32x32 split is set to 1 */
823
1.30M
                ps_intra32_analyse->b1_split_flag = 1;
824
825
1.30M
                ps_intra32_analyse->au1_best_modes_16x16_tu[0] = 0;
826
1.30M
                ps_intra32_analyse->au1_best_modes_16x16_tu[1] = 1;
827
1.30M
                ps_intra32_analyse->au1_best_modes_16x16_tu[2] = 255;
828
829
                /* 16x16 cost & 8x8 cost are stored in Raster scan order */
830
                /* stride of 16x16 buffer is MAX_CU_IN_CTB_ROW >> 1      */
831
                /* stride of 8x8 buffer is MAX_CU_IN_CTB_ROW             */
832
1.30M
                {
833
1.30M
                    WORD32 pos_x_8x8, pos_y_8x8;
834
835
1.30M
                    pos_x_8x8 = gau1_cu_pos_x[blk_cnt];
836
1.30M
                    pos_y_8x8 = gau1_cu_pos_y[blk_cnt];
837
838
1.30M
                    pi4_intra_16_cost = &ps_l0_ipe_out_ctb->ai4_best16x16_intra_cost[0];
839
840
1.30M
                    pi4_intra_16_cost +=
841
1.30M
                        ((pos_x_8x8 >> 1) + ((pos_y_8x8 >> 1) * (MAX_CU_IN_CTB_ROW >> 1)));
842
843
1.30M
                    pi4_intra_8_cost = &ps_l0_ipe_out_ctb->ai4_best8x8_intra_cost[0];
844
845
1.30M
                    pi4_intra_8_cost += (pos_x_8x8 + (pos_y_8x8 * MAX_CU_IN_CTB_ROW));
846
1.30M
                }
847
848
1.30M
                merge_32x32_l1 = 0;
849
1.30M
                merge_32x32_l2 = 0;
850
1.30M
                i4_skip_intra_eval_32x32_l1 = 0;
851
852
                /* Enable 16x16 merge iff sufficient 8x8 blocks remain in the current CTB */
853
1.30M
                merge_16x16_l1 = 0;
854
1.30M
                if(((num_8x8_blks_x - cu_pos_x) >= 2) && ((num_8x8_blks_y - cu_pos_y) >= 2))
855
1.23M
                {
856
1.23M
#if !ENABLE_UNIFORM_CU_SIZE_8x8
857
1.23M
                    merge_16x16_l1 = ps_ed_blk_l1->merge_success;
858
#else
859
                    merge_16x16_l1 = 0;
860
#endif
861
1.23M
                }
862
863
                /* Enable 32x32 merge iff sufficient 8x8 blocks remain in the current CTB */
864
1.30M
                if(((num_8x8_blks_x - cu_pos_x) >= 4) && ((num_8x8_blks_y - cu_pos_y) >= 4))
865
997k
                {
866
                    /* Check 4 flags of L1(8x8) say merge */
867
4.98M
                    for(i = 0; i < 4; i++)
868
3.99M
                    {
869
3.99M
                        merge_32x32_l1 += (ps_ed_blk_l1 + (i * 4))->merge_success;
870
871
                        //EIID: count of 16x16 blocks whose inter_intra flag says evaluate only inter, i.e. skip intra eval
872
3.99M
                        i4_skip_intra_eval_32x32_l1 +=
873
3.99M
                            ((ps_ed_blk_l1 + (i * 4))->intra_or_inter == 2) ? 1 : 0;
874
3.99M
                    }
875
876
997k
#if !ENABLE_UNIFORM_CU_SIZE_8x8
877
                    /* Check 1 flag from L2(16x16) say merge */
878
997k
                    merge_32x32_l2 = ps_ed_blk_l2->merge_success;
879
#else
880
                    merge_32x32_l1 = 0;
881
                    merge_32x32_l2 = 0;
882
#endif
883
997k
                }
884
885
1.30M
#if DISABLE_L2_IPE_IN_PB_L1_IN_B
886
1.30M
                if((i4_quality_preset == IHEVCE_QUALITY_P6) && (ps_ctxt->i4_slice_type != ISLICE))
887
219k
                {
888
219k
                    merge_32x32_l2 = 0;
889
219k
                    ps_ed_blk_l2->merge_success = 0;
890
219k
                }
891
1.30M
#endif
892
893
1.30M
                ps_intra32_analyse->b1_valid_cu = 1;
894
895
                /* If Merge success from all 4 L1 and L2, max CU size 32x32 is chosen */
896
                /* EIID: if all blocks to be skipped then skip entire 32x32 for intra eval,
897
                if no blocks to be skipped then eval entire 32x32,
898
                else break the merge and go to 16x16 level eval */
899
1.30M
                if((merge_32x32_l1 == 4) && merge_32x32_l2 &&
900
595k
                   ((i4_skip_intra_eval_32x32_l1 == 0) ||
901
40.3k
                    (i4_skip_intra_eval_32x32_l1 == 4))  //comment this line to disable break-merge
902
1.30M
                )
903
587k
                {
904
#if IP_DBG_L1_l2
905
                    /* Populate params for 32x32 block analysis */
906
                    ps_cu_node->ps_parent->best_cost = MAX_INTRA_COST_IPE;
907
908
                    ps_cu_node->ps_parent->u1_cu_size = 32;
909
                    ps_cu_node->ps_parent->u2_x0 = gau1_cu_pos_x[blk_cnt]; /* Populate properly */
910
                    ps_cu_node->ps_parent->u2_y0 = gau1_cu_pos_y[blk_cnt]; /* Populate properly */
911
                    ps_cu_node->ps_parent->best_mode = ps_ed_blk_l2->best_merge_mode;
912
                    /* CU size 32x32 and fill the final cu params */
913
914
                    ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
915
916
                    /* Increment pointers */
917
                    ps_ed_blk_l1 += 16;
918
                    blk_cnt += 16;
919
                    ps_row_cu++;
920
                    merge_64x64 &= 1;
921
#else
922
923
                    /* EIID: don't evaluate if all 4 blocks at L1 said inter is winning */
924
587k
                    if(4 == i4_skip_intra_eval_32x32_l1 && (ps_ctxt->i4_slice_type != ISLICE))
925
32.4k
                    {
926
32.4k
                        WORD32 i4_local_ctr1, i4_local_ctr2;
927
928
32.4k
                        ps_cu_node->ps_parent->best_cost = MAX_INTRA_COST_IPE;
929
930
32.4k
                        ps_cu_node->ps_parent->u1_cu_size = 32;
931
32.4k
                        ps_cu_node->ps_parent->u2_x0 =
932
32.4k
                            gau1_cu_pos_x[blk_cnt]; /* Populate properly */
933
32.4k
                        ps_cu_node->ps_parent->u2_y0 =
934
32.4k
                            gau1_cu_pos_y[blk_cnt]; /* Populate properly */
935
32.4k
                        ps_cu_node->ps_parent->best_mode =
936
32.4k
                            INTRA_DC;  //ps_ed_blk_l2->best_merge_mode;
937
                        /* CU size 32x32 and fill the final cu params */
938
939
                        /* fill in the first modes as invalid */
940
32.4k
                        ps_cu_node->ps_parent->au1_best_mode_1tu[0] = INTRA_DC;
941
32.4k
                        ps_cu_node->ps_parent->au1_best_mode_1tu[1] =
942
32.4k
                            INTRA_DC;  //for safery. Since update_cand_list will set num_modes as 3
943
32.4k
                        ps_cu_node->ps_parent->au1_best_mode_1tu[2] = INTRA_DC;
944
945
32.4k
                        ps_cu_node->ps_parent->au1_best_mode_4tu[0] = INTRA_DC;
946
32.4k
                        ps_cu_node->ps_parent->au1_best_mode_4tu[1] = INTRA_DC;
947
32.4k
                        ps_cu_node->ps_parent->au1_best_mode_4tu[2] = INTRA_DC;
948
949
32.4k
                        ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
950
951
                        //ps_row_cu->s_cu_intra_cand.b6_num_intra_cands = 0;
952
                        //ps_row_cu->u1_num_intra_rdopt_cands = 0;
953
954
32.4k
                        ps_intra32_analyse->b1_valid_cu = 0;
955
32.4k
                        ps_intra32_analyse->b1_split_flag = 0;
956
32.4k
                        ps_intra32_analyse->b1_merge_flag = 0;
957
                        /*memset (&ps_intra32_analyse->au1_best_modes_32x32_tu,
958
                        255,
959
                        NUM_BEST_MODES);
960
                        memset (&ps_intra32_analyse->au1_best_modes_16x16_tu,
961
                        255,
962
                        NUM_BEST_MODES);*/
963
                        //set only the first mode: if it is 255, it won't go ahead
964
32.4k
                        ps_intra32_analyse->au1_best_modes_32x32_tu[0] = 255;
965
32.4k
                        ps_intra32_analyse->au1_best_modes_16x16_tu[0] = 255;
966
967
32.4k
                        *pi4_intra_32_cost = MAX_INTRA_COST_IPE;
968
969
                        /*since ME will start evaluating from bottom up, set the lower
970
                        cu size data invalid */
971
162k
                        for(i4_local_ctr1 = 0; i4_local_ctr1 < 4; i4_local_ctr1++)
972
129k
                        {
973
129k
                            WORD32 *pi4_intra_8_cost_curr16;
974
975
129k
                            ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
976
129k
                                .au1_best_modes_16x16_tu[0] = 255;
977
129k
                            ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
978
129k
                                .au1_best_modes_8x8_tu[0] = 255;
979
129k
                            ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1].b1_merge_flag = 0;
980
129k
                            ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1].b1_valid_cu = 0;
981
129k
                            ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1].b1_split_flag = 0;
982
983
129k
                            pi4_intra_16_cost
984
129k
                                [(i4_local_ctr1 & 1) + ((MAX_CU_IN_CTB_ROW >> 1) *
985
129k
                                                        (i4_local_ctr1 >> 1))] = MAX_INTRA_COST_IPE;
986
987
129k
                            pi4_intra_8_cost_curr16 = pi4_intra_8_cost + ((i4_local_ctr1 & 1) << 1);
988
129k
                            pi4_intra_8_cost_curr16 +=
989
129k
                                ((i4_local_ctr1 >> 1) << 1) * MAX_CU_IN_CTB_ROW;
990
991
649k
                            for(i4_local_ctr2 = 0; i4_local_ctr2 < 4; i4_local_ctr2++)
992
519k
                            {
993
519k
                                ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
994
519k
                                    .as_intra8_analyse[i4_local_ctr2]
995
519k
                                    .au1_4x4_best_modes[0][0] = 255;
996
519k
                                ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
997
519k
                                    .as_intra8_analyse[i4_local_ctr2]
998
519k
                                    .au1_4x4_best_modes[1][0] = 255;
999
519k
                                ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1000
519k
                                    .as_intra8_analyse[i4_local_ctr2]
1001
519k
                                    .au1_4x4_best_modes[2][0] = 255;
1002
519k
                                ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1003
519k
                                    .as_intra8_analyse[i4_local_ctr2]
1004
519k
                                    .au1_4x4_best_modes[3][0] = 255;
1005
519k
                                ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1006
519k
                                    .as_intra8_analyse[i4_local_ctr2]
1007
519k
                                    .au1_best_modes_8x8_tu[0] = 255;
1008
519k
                                ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1009
519k
                                    .as_intra8_analyse[i4_local_ctr2]
1010
519k
                                    .au1_best_modes_4x4_tu[0] = 255;
1011
519k
                                ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1012
519k
                                    .as_intra8_analyse[i4_local_ctr2]
1013
519k
                                    .b1_valid_cu = 0;
1014
1015
519k
                                pi4_intra_8_cost_curr16
1016
519k
                                    [(i4_local_ctr2 & 1) +
1017
519k
                                     (MAX_CU_IN_CTB_ROW * (i4_local_ctr2 >> 1))] =
1018
519k
                                        MAX_INTRA_COST_IPE;
1019
519k
                            }
1020
129k
                        }
1021
1022
                        /* set neighbours even if intra is not evaluated, since source is always available. */
1023
32.4k
                        ihevce_set_nbr_map(
1024
32.4k
                            ps_ctxt->pu1_ctb_nbr_map,
1025
32.4k
                            ps_ctxt->i4_nbr_map_strd,
1026
32.4k
                            ps_cu_node->ps_parent->u2_x0 << 1,
1027
32.4k
                            ps_cu_node->ps_parent->u2_y0 << 1,
1028
32.4k
                            (ps_cu_node->ps_parent->u1_cu_size >> 2),
1029
32.4k
                            1);
1030
1031
                        /* cost accumulation of best cu size candidate */
1032
                        /*i8_frame_acc_satd_cost += parent_cost;*/
1033
1034
                        /* Mode bits cost accumulation for best cu size and cu mode */
1035
                        /*i8_frame_acc_mode_bits_cost += ps_cu_node->ps_parent->u2_mode_bits_cost;*/
1036
1037
                        /*satd/mod_qp accumulation of best cu */
1038
                        /*i8_frame_acc_satd_by_modqp_q10 += ((LWORD64)ps_cu_node->ps_parent->best_satd << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3))/i4_q_scale_q3_mod;*/
1039
1040
                        /* Increment pointers */
1041
32.4k
                        ps_ed_blk_l1 += 16;
1042
32.4k
                        blk_cnt += 16;
1043
                        //ps_row_cu++;
1044
32.4k
                        merge_64x64 = 0;
1045
1046
                        /* increment for stat purpose only. Increment is valid only on single thread */
1047
32.4k
                        ps_ctxt->u4_num_16x16_skips_at_L0_IPE += 4;
1048
32.4k
                    }
1049
555k
                    else
1050
555k
                    {
1051
                        /* Revaluation of 4 16x16 blocks at 8x8 prediction level */
1052
                        //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
1053
1054
555k
                        if((ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P6) &&
1055
40.8k
                           (ps_ctxt->i4_slice_type == PSLICE))
1056
0
                        {
1057
0
                            ps_ctxt->u1_disable_child_cu_decide = 1;
1058
0
                            step2_bypass = 0;
1059
0
                        }
1060
1061
                        /* Based on the flag, Child modes decision can be disabled*/
1062
555k
                        if(0 == ps_ctxt->u1_disable_child_cu_decide)
1063
555k
                        {
1064
2.77M
                            for(j = 0; j < 4; j++)
1065
2.22M
                            {
1066
2.22M
                                ps_cu_node->ps_sub_cu[j]->u2_x0 =
1067
2.22M
                                    gau1_cu_pos_x[blk_cnt + (j * 4)]; /* Populate properly */
1068
2.22M
                                ps_cu_node->ps_sub_cu[j]->u2_y0 =
1069
2.22M
                                    gau1_cu_pos_y[blk_cnt + (j * 4)]; /* Populate properly */
1070
2.22M
                                ps_cu_node->ps_sub_cu[j]->u1_cu_size = 16;
1071
1072
2.22M
                                {
1073
2.22M
                                    WORD32 best_ang_mode =
1074
2.22M
                                        (ps_ed_blk_l1 + (j * 4))->best_merge_mode;
1075
1076
2.22M
                                    if(best_ang_mode < 2)
1077
2.14M
                                        best_ang_mode = 26;
1078
1079
2.22M
                                    ihevce_mode_eval_filtering(
1080
2.22M
                                        ps_cu_node->ps_sub_cu[j],
1081
2.22M
                                        ps_cu_node,
1082
2.22M
                                        ps_ctxt,
1083
2.22M
                                        ps_curr_src,
1084
2.22M
                                        best_ang_mode,
1085
2.22M
                                        &ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0],
1086
2.22M
                                        &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1087
2.22M
                                        !step2_bypass,
1088
2.22M
                                        1);
1089
1090
2.22M
                                    if(i4_enable_4cu_16tu)
1091
1.29M
                                    {
1092
1.29M
                                        ihevce_mode_eval_filtering(
1093
1.29M
                                            ps_cu_node->ps_sub_cu[j],
1094
1.29M
                                            ps_cu_node,
1095
1.29M
                                            ps_ctxt,
1096
1.29M
                                            ps_curr_src,
1097
1.29M
                                            best_ang_mode,
1098
1.29M
                                            &ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1099
1.29M
                                            &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1100
1.29M
                                            !step2_bypass,
1101
1.29M
                                            0);
1102
1.29M
                                    }
1103
922k
                                    else
1104
922k
                                    {
1105
                                        /* 4TU not evaluated :  4tu modes set same as 1tu modes */
1106
922k
                                        memcpy(
1107
922k
                                            &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1108
922k
                                            &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1109
922k
                                            NUM_BEST_MODES);
1110
1111
                                        /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
1112
922k
                                        memcpy(
1113
922k
                                            &ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1114
922k
                                            &ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0],
1115
922k
                                            NUM_BEST_MODES * sizeof(WORD32));
1116
922k
                                    }
1117
1118
2.22M
                                    child_cost[j] =
1119
2.22M
                                        MIN(ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1120
2.22M
                                            ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0]);
1121
1122
                                    /* Child cost is sum of costs at 16x16 level  */
1123
2.22M
                                    child_cost_least += child_cost[j];
1124
1125
                                    /* Select the best mode to be populated as top and left nbr depending on the
1126
                                    4tu and 1tu cost */
1127
2.22M
                                    if(ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0] >
1128
2.22M
                                       ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0])
1129
5.70k
                                    {
1130
5.70k
                                        ps_cu_node->ps_sub_cu[j]->best_mode =
1131
5.70k
                                            ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0];
1132
5.70k
                                    }
1133
2.21M
                                    else
1134
2.21M
                                    {
1135
2.21M
                                        ps_cu_node->ps_sub_cu[j]->best_mode =
1136
2.21M
                                            ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0];
1137
2.21M
                                    }
1138
1139
2.22M
                                    { /* Update the CTB nodes only for MAX - 1 CU nodes */
1140
2.22M
                                        WORD32 xA, yA, row, col;
1141
2.22M
                                        xA = ((ps_cu_node->ps_sub_cu[j]->u2_x0 << 3) >> 2) + 1;
1142
2.22M
                                        yA = ((ps_cu_node->ps_sub_cu[j]->u2_y0 << 3) >> 2) + 1;
1143
2.22M
                                        size = ps_cu_node->ps_sub_cu[j]->u1_cu_size >> 2;
1144
11.1M
                                        for(row = yA; row < (yA + size); row++)
1145
8.88M
                                        {
1146
44.4M
                                            for(col = xA; col < (xA + size); col++)
1147
35.5M
                                            {
1148
35.5M
                                                ps_ctxt->au1_ctb_mode_map[row][col] =
1149
35.5M
                                                    ps_cu_node->ps_sub_cu[j]->best_mode;
1150
35.5M
                                            }
1151
8.88M
                                        }
1152
2.22M
                                    }
1153
2.22M
                                }
1154
1155
                                /*Child SATD cost*/
1156
2.22M
                                child_satd[j] = ps_cu_node->ps_sub_cu[j]->best_satd;
1157
1158
                                /* store the child 16x16 costs */
1159
2.22M
                                pi4_intra_16_cost[(j & 1) + ((MAX_CU_IN_CTB_ROW >> 1) * (j >> 1))] =
1160
2.22M
                                    child_cost[j];
1161
1162
                                /* set the CU valid flag */
1163
2.22M
                                ps_intra16_analyse[j].b1_valid_cu = 1;
1164
1165
                                /* All 16x16 merge is valid, if Cu 32x32 is chosen */
1166
                                /* To be reset, if CU 64x64 is chosen */
1167
2.22M
                                ps_intra16_analyse[j].b1_merge_flag = 1;
1168
1169
                                /* storing the modes to intra 16 analyse */
1170
                                /* store the best 16x16 modes 8x8 tu */
1171
2.22M
                                memcpy(
1172
2.22M
                                    &ps_intra16_analyse[j].au1_best_modes_8x8_tu[0],
1173
2.22M
                                    &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1174
2.22M
                                    sizeof(UWORD8) * (NUM_BEST_MODES));
1175
2.22M
                                ps_intra16_analyse[j].au1_best_modes_8x8_tu[NUM_BEST_MODES] = 255;
1176
1177
                                /* store the best 16x16 modes 16x16 tu */
1178
2.22M
                                memcpy(
1179
2.22M
                                    &ps_intra16_analyse[j].au1_best_modes_16x16_tu[0],
1180
2.22M
                                    &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1181
2.22M
                                    sizeof(UWORD8) * (NUM_BEST_MODES));
1182
2.22M
                                ps_intra16_analyse[j].au1_best_modes_16x16_tu[NUM_BEST_MODES] = 255;
1183
1184
                                /* divide the 16x16 costs (pro rating) to 4 8x8 costs */
1185
                                /* store the same 16x16 modes as 4 8x8 child modes    */
1186
2.22M
                                {
1187
2.22M
                                    WORD32 idx_8x8;
1188
2.22M
                                    WORD32 *pi4_intra_8_cost_curr16;
1189
2.22M
                                    intra8_analyse_t *ps_intra8_analyse;
1190
1191
2.22M
                                    pi4_intra_8_cost_curr16 = pi4_intra_8_cost + ((j & 1) << 1);
1192
2.22M
                                    pi4_intra_8_cost_curr16 += ((j >> 1) << 1) * MAX_CU_IN_CTB_ROW;
1193
1194
11.1M
                                    for(idx_8x8 = 0; idx_8x8 < 4; idx_8x8++)
1195
8.88M
                                    {
1196
8.88M
                                        pi4_intra_8_cost_curr16
1197
8.88M
                                            [(idx_8x8 & 1) + (MAX_CU_IN_CTB_ROW * (idx_8x8 >> 1))] =
1198
8.88M
                                                (child_cost[j] + 3) >> 2;
1199
1200
8.88M
                                        ps_intra8_analyse =
1201
8.88M
                                            &ps_intra16_analyse[j].as_intra8_analyse[idx_8x8];
1202
1203
8.88M
                                        ps_intra8_analyse->b1_enable_nxn = 0;
1204
8.88M
                                        ps_intra8_analyse->b1_valid_cu = 1;
1205
1206
                                        /* store the best 8x8 modes 8x8 tu */
1207
8.88M
                                        memcpy(
1208
8.88M
                                            &ps_intra8_analyse->au1_best_modes_8x8_tu[0],
1209
8.88M
                                            &ps_intra16_analyse[j].au1_best_modes_8x8_tu[0],
1210
8.88M
                                            sizeof(UWORD8) * (NUM_BEST_MODES + 1));
1211
1212
                                        /* store the best 8x8 modes 4x4 tu */
1213
8.88M
                                        memcpy(
1214
8.88M
                                            &ps_intra8_analyse->au1_best_modes_4x4_tu[0],
1215
8.88M
                                            &ps_intra16_analyse[j].au1_best_modes_8x8_tu[0],
1216
8.88M
                                            sizeof(UWORD8) * (NUM_BEST_MODES + 1));
1217
1218
                                        /* NXN modes not evaluated, hence filled with 255 */
1219
8.88M
                                        memset(
1220
8.88M
                                            &ps_intra8_analyse->au1_4x4_best_modes[0][0],
1221
8.88M
                                            255,
1222
8.88M
                                            sizeof(UWORD8) * 4 * (NUM_BEST_MODES + 1));
1223
8.88M
                                    }
1224
2.22M
                                }
1225
2.22M
                            }
1226
1227
555k
                            ihevce_set_nbr_map(
1228
555k
                                ps_ctxt->pu1_ctb_nbr_map,
1229
555k
                                ps_ctxt->i4_nbr_map_strd,
1230
555k
                                ps_cu_node->ps_sub_cu[0]->u2_x0 << 1,
1231
555k
                                ps_cu_node->ps_sub_cu[0]->u2_y0 << 1,
1232
555k
                                (ps_cu_node->ps_sub_cu[0]->u1_cu_size >> 1),
1233
555k
                                0);
1234
555k
                        }
1235
0
#if 1  //DISBLE_CHILD_CU_EVAL_L0_IPE //1
1236
0
                        else
1237
0
                        {
1238
0
                            for(j = 0; j < 4; j++)
1239
0
                            {
1240
0
                                WORD32 idx_8x8;
1241
0
                                intra8_analyse_t *ps_intra8_analyse;
1242
0
                                ps_intra16_analyse[j].au1_best_modes_8x8_tu[0] = 255;
1243
0
                                ps_intra16_analyse[j].au1_best_modes_16x16_tu[0] = 255;
1244
1245
0
                                ps_intra16_analyse[j].b1_valid_cu = 0;
1246
1247
0
                                for(idx_8x8 = 0; idx_8x8 < 4; idx_8x8++)
1248
0
                                {
1249
0
                                    ps_intra8_analyse =
1250
0
                                        &ps_intra16_analyse[j].as_intra8_analyse[idx_8x8];
1251
1252
0
                                    ps_intra8_analyse->au1_best_modes_8x8_tu[0] = 255;
1253
0
                                    ps_intra8_analyse->au1_best_modes_4x4_tu[0] = 255;
1254
1255
0
                                    ps_intra8_analyse->b1_enable_nxn = 0;
1256
0
                                    ps_intra8_analyse->b1_valid_cu = 0;
1257
1258
                                    /* NXN modes not evaluated, hence filled with 255 */
1259
0
                                    memset(
1260
0
                                        &ps_intra8_analyse->au1_4x4_best_modes[0][0],
1261
0
                                        255,
1262
0
                                        sizeof(UWORD8) * 4 * (NUM_BEST_MODES + 1));
1263
0
                                }
1264
0
                            }
1265
1266
0
                            child_cost_least = MAX_INTRA_COST_IPE;
1267
0
                        }
1268
555k
#endif
1269
1270
                        /* Populate params for 32x32 block analysis */
1271
1272
555k
                        ps_cu_node->ps_parent->u1_cu_size = 32;
1273
555k
                        ps_cu_node->ps_parent->u2_x0 =
1274
555k
                            gau1_cu_pos_x[blk_cnt]; /* Populate properly */
1275
555k
                        ps_cu_node->ps_parent->u2_y0 =
1276
555k
                            gau1_cu_pos_y[blk_cnt]; /* Populate properly */
1277
1278
                        /* Re-evaluation for 32x32 parent block at 16x16 prediction level */
1279
                        //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
1280
1281
555k
                        {
1282
                            /* Eval for TUSize = CuSize */
1283
555k
                            ihevce_mode_eval_filtering(
1284
555k
                                ps_cu_node->ps_parent,
1285
555k
                                ps_cu_node,
1286
555k
                                ps_ctxt,
1287
555k
                                ps_curr_src,
1288
555k
                                26,
1289
555k
                                &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
1290
555k
                                &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1291
555k
                                step2_bypass,
1292
555k
                                1);
1293
1294
555k
                            if(i4_enable_1cu_4tu)
1295
324k
                            {
1296
                                /* Eval for TUSize = CuSize/2 */
1297
324k
                                ihevce_mode_eval_filtering(
1298
324k
                                    ps_cu_node->ps_parent,
1299
324k
                                    ps_cu_node,
1300
324k
                                    ps_ctxt,
1301
324k
                                    ps_curr_src,
1302
324k
                                    26,
1303
324k
                                    &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1304
324k
                                    &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1305
324k
                                    step2_bypass,
1306
324k
                                    0);
1307
324k
                            }
1308
230k
                            else
1309
230k
                            {
1310
                                /* 4TU not evaluated :  4tu modes set same as 1tu modes */
1311
230k
                                memcpy(
1312
230k
                                    &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1313
230k
                                    &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1314
230k
                                    NUM_BEST_MODES);
1315
1316
                                /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
1317
230k
                                memcpy(
1318
230k
                                    &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1319
230k
                                    &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
1320
230k
                                    NUM_BEST_MODES * sizeof(WORD32));
1321
230k
                            }
1322
555k
                        }
1323
1324
555k
                        ps_ctxt->u1_disable_child_cu_decide = 0;
1325
555k
                        step2_bypass = 1;
1326
1327
                        /* Update parent cost */
1328
555k
                        parent_cost =
1329
555k
                            MIN(ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1330
555k
                                ps_cu_node->ps_parent->au4_best_cost_1tu[0]);
1331
1332
                        /* Select the best mode to be populated as top and left nbr depending on the
1333
                        4tu and 1tu cost */
1334
555k
                        if(ps_cu_node->ps_parent->au4_best_cost_4tu[0] >
1335
555k
                           ps_cu_node->ps_parent->au4_best_cost_1tu[0])
1336
2.50k
                        {
1337
2.50k
                            ps_cu_node->ps_parent->best_mode =
1338
2.50k
                                ps_cu_node->ps_parent->au1_best_mode_1tu[0];
1339
2.50k
                        }
1340
552k
                        else
1341
552k
                        {
1342
552k
                            ps_cu_node->ps_parent->best_mode =
1343
552k
                                ps_cu_node->ps_parent->au1_best_mode_4tu[0];
1344
552k
                        }
1345
1346
                        /* store the 32x32 cost */
1347
555k
                        *pi4_intra_32_cost = parent_cost;
1348
1349
                        /* set the CU valid flag */
1350
555k
                        ps_intra32_analyse->b1_valid_cu = 1;
1351
1352
555k
                        ps_intra32_analyse->b1_merge_flag = 1;
1353
1354
                        /* storing the modes to intra 32 analyse */
1355
555k
                        {
1356
                            /* store the best 32x32 modes 16x16 tu */
1357
555k
                            memcpy(
1358
555k
                                &ps_intra32_analyse->au1_best_modes_16x16_tu[0],
1359
555k
                                &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1360
555k
                                sizeof(UWORD8) * (NUM_BEST_MODES));
1361
555k
                            ps_intra32_analyse->au1_best_modes_16x16_tu[NUM_BEST_MODES] = 255;
1362
1363
                            /* store the best 32x32 modes 32x32 tu */
1364
555k
                            memcpy(
1365
555k
                                &ps_intra32_analyse->au1_best_modes_32x32_tu[0],
1366
555k
                                &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1367
555k
                                sizeof(UWORD8) * (NUM_BEST_MODES));
1368
555k
                            ps_intra32_analyse->au1_best_modes_32x32_tu[NUM_BEST_MODES] = 255;
1369
555k
                        }
1370
555k
                        parent_best_mode = ps_cu_node->ps_parent->best_mode;
1371
555k
                        if((parent_cost <=
1372
555k
                            child_cost_least + (ps_ctxt->i4_ol_satd_lambda * CHILD_BIAS >>
1373
555k
                                                LAMBDA_Q_SHIFT)))  //|| identical_modes)
1374
504k
                        {
1375
504k
                            WORD32 i4_q_scale_q3_mod;
1376
504k
                            UWORD8 u1_cu_possible_qp;
1377
504k
                            WORD32 i4_act_factor;
1378
1379
                            /* CU size 32x32 and fill the final cu params */
1380
1381
504k
                            ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
1382
1383
504k
                            if((IHEVCE_QUALITY_P3 > i4_quality_preset))
1384
295k
                            {
1385
1.47M
                                for(i = 0; i < 4; i++)
1386
1.18M
                                {
1387
1.18M
                                    intra8_analyse_t *ps_intra8_analyse;
1388
1.18M
                                    ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[i];
1389
5.91M
                                    for(j = 0; j < 4; j++)
1390
4.72M
                                    {
1391
                                        /* Populate best 3 nxn modes */
1392
4.72M
                                        ps_intra8_analyse->au1_4x4_best_modes[j][0] =
1393
4.72M
                                            ps_cu_node->ps_sub_cu[i]->au1_best_mode_4tu[0];
1394
4.72M
                                        ps_intra8_analyse->au1_4x4_best_modes[j][1] =
1395
4.72M
                                            ps_cu_node->ps_sub_cu[i]
1396
4.72M
                                                ->au1_best_mode_4tu[1];  //(ps_ed + 1)->best_mode;
1397
4.72M
                                        ps_intra8_analyse->au1_4x4_best_modes[j][2] =
1398
4.72M
                                            ps_cu_node->ps_sub_cu[i]
1399
4.72M
                                                ->au1_best_mode_4tu[2];  //(ps_ed + 2)->best_mode;
1400
4.72M
                                        ps_intra8_analyse->au1_4x4_best_modes[j][3] = 255;
1401
4.72M
                                    }
1402
1.18M
                                }
1403
295k
                            }
1404
                            /* store the 32x32 non split flag */
1405
504k
                            ps_intra32_analyse->b1_split_flag = 0;
1406
504k
                            ps_intra32_analyse->as_intra16_analyse[0].b1_split_flag = 0;
1407
504k
                            ps_intra32_analyse->as_intra16_analyse[1].b1_split_flag = 0;
1408
504k
                            ps_intra32_analyse->as_intra16_analyse[2].b1_split_flag = 0;
1409
504k
                            ps_intra32_analyse->as_intra16_analyse[3].b1_split_flag = 0;
1410
1411
504k
                            au1_best_32x32_modes[blk_cnt >> 4] =
1412
504k
                                ps_cu_node->ps_parent->au1_best_mode_1tu[0];
1413
1414
504k
                            au4_best_32x32_cost[blk_cnt >> 4] =
1415
504k
                                ps_cu_node->ps_parent->au4_best_cost_1tu[0];
1416
                            /*As 32*32 has won, pick L2 8x8 qp which maps
1417
                            to L0 32x32 Qp*/
1418
504k
                            ASSERT(((blk_cnt >> 4) & 3) == (blk_cnt >> 4));
1419
504k
                            ASSERT(ps_ed_ctb_l1->i4_16x16_satd[blk_cnt >> 4][0] != -2);
1420
504k
                            u1_cu_possible_qp = ihevce_cu_level_qp_mod(
1421
504k
                                ps_ctxt->i4_qscale,
1422
504k
                                ps_ed_ctb_l1->i4_16x16_satd[blk_cnt >> 4][0],
1423
504k
                                ps_ctxt->ld_curr_frame_16x16_log_avg[0],
1424
504k
                                f_strength,
1425
504k
                                &i4_act_factor,
1426
504k
                                &i4_q_scale_q3_mod,
1427
504k
                                ps_ctxt->ps_rc_quant_ctxt);
1428
                            /* cost accumulation of best cu size candidate */
1429
504k
                            i8_frame_acc_satd_cost += parent_cost;
1430
1431
                            /* satd and mpm bits accumulation of best cu size candidate */
1432
504k
                            i4_ctb_acc_satd += ps_cu_node->ps_parent->best_satd;
1433
1434
                            /* Mode bits cost accumulation for best cu size and cu mode */
1435
504k
                            i8_frame_acc_mode_bits_cost += ps_cu_node->ps_parent->u2_mode_bits_cost;
1436
1437
                            /*satd/mod_qp accumulation of best cu */
1438
504k
                            i8_frame_acc_satd_by_modqp_q10 +=
1439
504k
                                ((LWORD64)ps_cu_node->ps_parent->best_satd
1440
504k
                                 << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
1441
504k
                                i4_q_scale_q3_mod;
1442
1443
                            /* Increment pointers */
1444
504k
                            ps_ed_blk_l1 += 16;
1445
504k
                            blk_cnt += 16;
1446
                            //ps_row_cu++;
1447
504k
                            merge_64x64 &= 1;
1448
504k
                        }
1449
50.6k
                        else
1450
50.6k
                        {
1451
                            /* store the 32x32 split flag */
1452
50.6k
                            ps_intra32_analyse->b1_split_flag = 1;
1453
1454
                            /* CU size 16x16 and fill the final cu params for all 4 blocks */
1455
253k
                            for(j = 0; j < 4; j++)
1456
202k
                            {
1457
202k
                                WORD32 i4_q_scale_q3_mod;
1458
202k
                                UWORD8 u1_cu_possible_qp;
1459
202k
                                WORD32 i4_act_factor;
1460
1461
                                /* Set CU split flag */
1462
202k
                                ASSERT(blk_cnt % 4 == 0);
1463
1464
202k
                                ihevce_update_cand_list(
1465
202k
                                    ps_cu_node->ps_sub_cu[j], ps_ed_blk_l1, ps_ctxt);
1466
1467
                                /* store the 16x16 non split flag  */
1468
202k
                                ps_intra16_analyse[j].b1_split_flag = 0;
1469
1470
202k
                                ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
1471
202k
                                ASSERT(ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][0] != -2);
1472
                                /*As 16*16 has won, pick L1 8x8 qp which maps
1473
                                to L0 16x16 Qp*/
1474
202k
                                u1_cu_possible_qp = ihevce_cu_level_qp_mod(
1475
202k
                                    ps_ctxt->i4_qscale,
1476
202k
                                    ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][0],
1477
202k
                                    ps_ctxt->ld_curr_frame_8x8_log_avg[0],
1478
202k
                                    f_strength,
1479
202k
                                    &i4_act_factor,
1480
202k
                                    &i4_q_scale_q3_mod,
1481
202k
                                    ps_ctxt->ps_rc_quant_ctxt);
1482
1483
                                /*accum satd/qp for all child block*/
1484
202k
                                i8_frame_acc_satd_by_modqp_q10 +=
1485
202k
                                    ((LWORD64)child_satd[j]
1486
202k
                                     << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
1487
202k
                                    i4_q_scale_q3_mod;
1488
1489
                                /* Accumulate mode bits for all child blocks */
1490
202k
                                i8_frame_acc_mode_bits_cost +=
1491
202k
                                    ps_cu_node->ps_sub_cu[j]->u2_mode_bits_cost;
1492
1493
                                /* satd and mpm bits accumulation of best cu size candidate */
1494
202k
                                i4_ctb_acc_satd += child_satd[j];
1495
1496
                                /* Increment pointers */
1497
                                //ps_row_cu++;
1498
202k
                                ps_ed_blk_l1 += 4;
1499
202k
                                blk_cnt += 4;
1500
202k
                            }
1501
1502
                            /* cost accumulation of best cu size candidate */
1503
50.6k
                            i8_frame_acc_satd_cost += child_cost_least;
1504
1505
                            /* 64x64 merge is not possible */
1506
50.6k
                            merge_64x64 = 0;
1507
50.6k
                        }
1508
1509
                        //ps_ed_blk_l2 += 4;
1510
1511
555k
                    }  //end of EIID's else
1512
587k
#endif
1513
587k
                }
1514
                /* If Merge success for L1 max CU size 16x16 is chosen */
1515
714k
                else if(merge_16x16_l1)
1516
506k
                {
1517
#if IP_DBG_L1_l2
1518
                    ps_cu_node->ps_parent->u1_cu_size = 16;
1519
                    ps_cu_node->ps_parent->u2_x0 = gau1_cu_pos_x[blk_cnt]; /* Populate properly */
1520
                    ps_cu_node->ps_parent->u2_y0 = gau1_cu_pos_y[blk_cnt]; /* Populate properly */
1521
                    ps_cu_node->ps_parent->best_mode = ps_ed_blk_l1->best_merge_mode;
1522
                    ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
1523
1524
                    blk_cnt += 4;
1525
                    ps_ed_blk_l1 += 4;
1526
                    ps_row_cu++;
1527
                    merge_64x64 = 0;
1528
#else
1529
1530
                    /*EIID: evaluate only if L1 early-inter-intra decision is not favouring inter*/
1531
                    /* enable this only in B pictures */
1532
506k
                    if(ps_ed_blk_l1->intra_or_inter == 2 && (ps_ctxt->i4_slice_type != ISLICE))
1533
144k
                    {
1534
144k
                        WORD32 i4_q_scale_q3_mod, i4_local_ctr;
1535
144k
                        WORD8 i1_cu_possible_qp;
1536
144k
                        WORD32 i4_act_factor;
1537
                        /* make cost infinity. */
1538
                        /* make modes invalid */
1539
                        /* update loop variables */
1540
                        /* set other output variables */
1541
                        /* dont set neighbour flag so that next blocks wont access this cu */
1542
                        /* what happens to ctb_mode_map?? */
1543
1544
144k
                        ps_cu_node->ps_parent->u1_cu_size = 16;
1545
144k
                        ps_cu_node->ps_parent->u2_x0 =
1546
144k
                            gau1_cu_pos_x[blk_cnt]; /* Populate properly */
1547
144k
                        ps_cu_node->ps_parent->u2_y0 =
1548
144k
                            gau1_cu_pos_y[blk_cnt]; /* Populate properly */
1549
144k
                        ps_cu_node->ps_parent->best_mode =
1550
144k
                            INTRA_DC;  //ps_ed_blk_l1->best_merge_mode;
1551
1552
                        /* fill in the first modes as invalid */
1553
1554
144k
                        ps_cu_node->ps_parent->au1_best_mode_1tu[0] = INTRA_DC;
1555
144k
                        ps_cu_node->ps_parent->au1_best_mode_1tu[1] =
1556
144k
                            INTRA_DC;  //for safery. Since update_cand_list will set num_modes as 3
1557
144k
                        ps_cu_node->ps_parent->au1_best_mode_1tu[2] = INTRA_DC;
1558
1559
144k
                        ps_cu_node->ps_parent->au1_best_mode_4tu[0] = INTRA_DC;
1560
144k
                        ps_cu_node->ps_parent->au1_best_mode_4tu[1] = INTRA_DC;
1561
144k
                        ps_cu_node->ps_parent->au1_best_mode_4tu[2] = INTRA_DC;
1562
1563
144k
                        ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
1564
1565
                        //ps_row_cu->s_cu_intra_cand.b6_num_intra_cands = 0;
1566
                        //ps_row_cu->u1_num_intra_rdopt_cands = 0;
1567
1568
144k
                        ps_intra32_analyse->b1_split_flag = 1;
1569
144k
                        ps_intra32_analyse->b1_merge_flag = 0;
1570
1571
144k
                        ps_intra16_analyse->b1_valid_cu = 0;
1572
144k
                        ps_intra16_analyse->b1_split_flag = 0;
1573
144k
                        ps_intra16_analyse->b1_merge_flag = 1;
1574
                        //memset (&ps_intra16_analyse->au1_best_modes_16x16_tu,
1575
                        //  255,
1576
                        //  NUM_BEST_MODES);
1577
                        //memset (&ps_intra16_analyse->au1_best_modes_8x8_tu,
1578
                        //  255,
1579
                        //  NUM_BEST_MODES);
1580
                        //set only first mode since if it's 255. it wont go ahead
1581
144k
                        ps_intra16_analyse->au1_best_modes_16x16_tu[0] = 255;
1582
144k
                        ps_intra16_analyse->au1_best_modes_8x8_tu[0] = 255;
1583
144k
                        *pi4_intra_16_cost = MAX_INTRA_COST_IPE;
1584
1585
                        /*since ME will start evaluating from bottom up, set the lower
1586
                        cu size data invalid */
1587
723k
                        for(i4_local_ctr = 0; i4_local_ctr < 4; i4_local_ctr++)
1588
578k
                        {
1589
578k
                            ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1590
578k
                                .au1_4x4_best_modes[0][0] = 255;
1591
578k
                            ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1592
578k
                                .au1_4x4_best_modes[1][0] = 255;
1593
578k
                            ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1594
578k
                                .au1_4x4_best_modes[2][0] = 255;
1595
578k
                            ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1596
578k
                                .au1_4x4_best_modes[3][0] = 255;
1597
578k
                            ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1598
578k
                                .au1_best_modes_8x8_tu[0] = 255;
1599
578k
                            ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1600
578k
                                .au1_best_modes_4x4_tu[0] = 255;
1601
1602
578k
                            pi4_intra_8_cost
1603
578k
                                [(i4_local_ctr & 1) + (MAX_CU_IN_CTB_ROW * (i4_local_ctr >> 1))] =
1604
578k
                                    MAX_INTRA_COST_IPE;
1605
578k
                        }
1606
1607
                        /* set neighbours even if intra is not evaluated, since source is always available. */
1608
144k
                        ihevce_set_nbr_map(
1609
144k
                            ps_ctxt->pu1_ctb_nbr_map,
1610
144k
                            ps_ctxt->i4_nbr_map_strd,
1611
144k
                            ps_cu_node->ps_parent->u2_x0 << 1,
1612
144k
                            ps_cu_node->ps_parent->u2_y0 << 1,
1613
144k
                            (ps_cu_node->ps_parent->u1_cu_size >> 2),
1614
144k
                            1);
1615
1616
                        //what happens to RC variables??
1617
                        /* run only constant Qp */
1618
144k
                        ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
1619
144k
                        ASSERT(ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][0] != -2);
1620
144k
                        i1_cu_possible_qp = ihevce_cu_level_qp_mod(
1621
144k
                            ps_ctxt->i4_qscale,
1622
144k
                            ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][0],
1623
144k
                            ps_ctxt->ld_curr_frame_8x8_log_avg[0],
1624
144k
                            f_strength,
1625
144k
                            &i4_act_factor,
1626
144k
                            &i4_q_scale_q3_mod,
1627
144k
                            ps_ctxt->ps_rc_quant_ctxt);
1628
1629
                        /* cost accumulation of best cu size candidate */
1630
144k
                        i8_frame_acc_satd_cost += 0;  //parent_cost;  //incorrect accumulation
1631
1632
                        /*satd/mod_qp accumulation of best cu */
1633
144k
                        i8_frame_acc_satd_by_modqp_q10 += 0;  //incorrect accumulation
1634
                        //((LWORD64)ps_cu_node->ps_parent->best_satd << SATD_BY_ACT_Q_FAC)/i4_q_scale_q3_mod;
1635
1636
                        /* Accumulate mode bits for all child blocks */
1637
144k
                        i8_frame_acc_mode_bits_cost +=
1638
144k
                            0;  //ps_cu_node->ps_parent->u2_mode_bits_cost;
1639
                        //incorrect accumulation
1640
1641
144k
                        blk_cnt += 4;
1642
144k
                        ps_ed_blk_l1 += 4;
1643
                        //ps_row_cu++;
1644
144k
                        merge_64x64 = 0;
1645
1646
                        /* increment for stat purpose only. Increment is valid only on single thread */
1647
144k
                        ps_ctxt->u4_num_16x16_skips_at_L0_IPE += 1;
1648
144k
                    }
1649
361k
                    else
1650
361k
                    {
1651
                        /* 64x64 merge is not possible */
1652
361k
                        merge_64x64 = 0;
1653
1654
                        /* set the 32x32 split flag to 1 */
1655
361k
                        ps_intra32_analyse->b1_split_flag = 1;
1656
1657
361k
                        ps_intra32_analyse->b1_merge_flag = 0;
1658
1659
361k
                        ps_intra16_analyse->b1_merge_flag = 1;
1660
1661
361k
                        if((ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P6) &&
1662
151k
                           (ps_ctxt->i4_slice_type == PSLICE))
1663
142k
                        {
1664
142k
                            ps_ctxt->u1_disable_child_cu_decide = 1;
1665
142k
                            step2_bypass = 0;
1666
142k
                        }
1667
                        //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
1668
                        /* Based on the flag, Child modes decision can be disabled*/
1669
361k
                        if(0 == ps_ctxt->u1_disable_child_cu_decide)
1670
218k
                        {
1671
1.09M
                            for(j = 0; j < 4; j++)
1672
874k
                            {
1673
874k
                                intra8_analyse_t *ps_intra8_analyse;
1674
874k
                                WORD32 best_ang_mode = (ps_ed_blk_l1 + j)->best_mode;
1675
1676
874k
                                if(best_ang_mode < 2)
1677
733k
                                    best_ang_mode = 26;
1678
1679
                                //ps_cu_node->ps_sub_cu[j]->best_cost = MAX_INTRA_COST_IPE;
1680
                                //ps_cu_node->ps_sub_cu[j]->best_mode = (ps_ed_blk_l1 + j)->best_mode;
1681
1682
874k
                                ps_cu_node->ps_sub_cu[j]->u2_x0 =
1683
874k
                                    gau1_cu_pos_x[blk_cnt + j]; /* Populate properly */
1684
874k
                                ps_cu_node->ps_sub_cu[j]->u2_y0 =
1685
874k
                                    gau1_cu_pos_y[blk_cnt + j]; /* Populate properly */
1686
874k
                                ps_cu_node->ps_sub_cu[j]->u1_cu_size = 8;
1687
1688
874k
                                ihevce_mode_eval_filtering(
1689
874k
                                    ps_cu_node->ps_sub_cu[j],
1690
874k
                                    ps_cu_node,
1691
874k
                                    ps_ctxt,
1692
874k
                                    ps_curr_src,
1693
874k
                                    best_ang_mode,
1694
874k
                                    &ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0],
1695
874k
                                    &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1696
874k
                                    !step2_bypass,
1697
874k
                                    1);
1698
1699
874k
                                if(i4_enable_4cu_16tu)
1700
401k
                                {
1701
401k
                                    ihevce_mode_eval_filtering(
1702
401k
                                        ps_cu_node->ps_sub_cu[j],
1703
401k
                                        ps_cu_node,
1704
401k
                                        ps_ctxt,
1705
401k
                                        ps_curr_src,
1706
401k
                                        best_ang_mode,
1707
401k
                                        &ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1708
401k
                                        &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1709
401k
                                        !step2_bypass,
1710
401k
                                        0);
1711
401k
                                }
1712
472k
                                else
1713
472k
                                {
1714
                                    /* 4TU not evaluated :  4tu modes set same as 1tu modes */
1715
472k
                                    memcpy(
1716
472k
                                        &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1717
472k
                                        &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1718
472k
                                        NUM_BEST_MODES);
1719
1720
                                    /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
1721
472k
                                    memcpy(
1722
472k
                                        &ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1723
472k
                                        &ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0],
1724
472k
                                        NUM_BEST_MODES * sizeof(WORD32));
1725
472k
                                }
1726
1727
874k
                                child_cost[j] =
1728
874k
                                    MIN(ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1729
874k
                                        ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0]);
1730
1731
874k
                                child_cost_least += child_cost[j];
1732
1733
                                /* Select the best mode to be populated as top and left nbr depending on the
1734
                                4tu and 1tu cost */
1735
874k
                                if(ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0] >
1736
874k
                                   ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0])
1737
28.3k
                                {
1738
28.3k
                                    ps_cu_node->ps_sub_cu[j]->best_mode =
1739
28.3k
                                        ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0];
1740
28.3k
                                }
1741
845k
                                else
1742
845k
                                {
1743
845k
                                    ps_cu_node->ps_sub_cu[j]->best_mode =
1744
845k
                                        ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0];
1745
845k
                                }
1746
874k
                                { /* Update the CTB nodes only for MAX - 1 CU nodes */
1747
874k
                                    WORD32 xA, yA, row, col;
1748
874k
                                    xA = ((ps_cu_node->ps_sub_cu[j]->u2_x0 << 3) >> 2) + 1;
1749
874k
                                    yA = ((ps_cu_node->ps_sub_cu[j]->u2_y0 << 3) >> 2) + 1;
1750
874k
                                    size = ps_cu_node->ps_sub_cu[j]->u1_cu_size >> 2;
1751
2.62M
                                    for(row = yA; row < (yA + size); row++)
1752
1.74M
                                    {
1753
5.24M
                                        for(col = xA; col < (xA + size); col++)
1754
3.49M
                                        {
1755
3.49M
                                            ps_ctxt->au1_ctb_mode_map[row][col] =
1756
3.49M
                                                ps_cu_node->ps_sub_cu[j]->best_mode;
1757
3.49M
                                        }
1758
1.74M
                                    }
1759
874k
                                }
1760
1761
                                /*collect individual child satd for final SATD/qp accum*/
1762
874k
                                child_satd[j] = ps_cu_node->ps_sub_cu[j]->best_satd;
1763
1764
874k
                                ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[j];
1765
1766
                                /* store the child 8x8 costs */
1767
874k
                                pi4_intra_8_cost[(j & 1) + (MAX_CU_IN_CTB_ROW * (j >> 1))] =
1768
874k
                                    child_cost[j];
1769
1770
                                /* set the CU valid flag */
1771
874k
                                ps_intra8_analyse->b1_valid_cu = 1;
1772
874k
                                ps_intra8_analyse->b1_enable_nxn = 0;
1773
1774
                                /* storing the modes to intra8  analyse */
1775
1776
                                /* store the best 8x8 modes 8x8 tu */
1777
874k
                                memcpy(
1778
874k
                                    &ps_intra8_analyse->au1_best_modes_8x8_tu[0],
1779
874k
                                    &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1780
874k
                                    sizeof(UWORD8) * (NUM_BEST_MODES));
1781
874k
                                ps_intra8_analyse->au1_best_modes_8x8_tu[NUM_BEST_MODES] = 255;
1782
1783
                                /* store the best 8x8 modes 4x4 tu */
1784
874k
                                memcpy(
1785
874k
                                    &ps_intra8_analyse->au1_best_modes_4x4_tu[0],
1786
874k
                                    &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1787
874k
                                    sizeof(UWORD8) * (NUM_BEST_MODES));
1788
874k
                                ps_intra8_analyse->au1_best_modes_4x4_tu[NUM_BEST_MODES] = 255;
1789
1790
                                /* NXN modes not evaluated hence set to 255 */
1791
874k
                                memset(
1792
874k
                                    &ps_intra8_analyse->au1_4x4_best_modes[0][0],
1793
874k
                                    255,
1794
874k
                                    sizeof(UWORD8) * 4 * (NUM_BEST_MODES + 1));
1795
874k
                            }
1796
1797
218k
                            ihevce_set_nbr_map(
1798
218k
                                ps_ctxt->pu1_ctb_nbr_map,
1799
218k
                                ps_ctxt->i4_nbr_map_strd,
1800
218k
                                ps_cu_node->ps_sub_cu[0]->u2_x0 << 1,
1801
218k
                                ps_cu_node->ps_sub_cu[0]->u2_y0 << 1,
1802
218k
                                (ps_cu_node->ps_sub_cu[0]->u1_cu_size >> 1),
1803
218k
                                0);
1804
218k
                        }
1805
142k
#if 1  //DISBLE_CHILD_CU_EVAL_L0_IPE //1
1806
142k
                        else
1807
142k
                        {
1808
714k
                            for(j = 0; j < 4; j++)
1809
571k
                            {
1810
571k
                                intra8_analyse_t *ps_intra8_analyse;
1811
571k
                                ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[j];
1812
571k
                                ps_intra8_analyse->au1_best_modes_8x8_tu[0] = 255;
1813
571k
                                ps_intra8_analyse->au1_best_modes_4x4_tu[0] = 255;
1814
                                /* NXN modes not evaluated hence set to 255 */
1815
571k
                                memset(
1816
571k
                                    &ps_intra8_analyse->au1_4x4_best_modes[0][0],
1817
571k
                                    255,
1818
571k
                                    sizeof(UWORD8) * 4 * (NUM_BEST_MODES + 1));
1819
1820
571k
                                ps_intra8_analyse->b1_valid_cu = 0;
1821
571k
                                ps_intra8_analyse->b1_enable_nxn = 0;
1822
571k
                            }
1823
142k
                            child_cost_least = MAX_INTRA_COST_IPE;
1824
142k
                        }
1825
361k
#endif
1826
                        //ps_cu_node->ps_parent->best_mode = ps_ed_blk_l1->best_mode;
1827
                        //ps_cu_node->ps_parent->best_cost = MAX_INTRA_COST_IPE;
1828
1829
361k
                        ps_cu_node->ps_parent->u1_cu_size = 16;
1830
361k
                        ps_cu_node->ps_parent->u2_x0 =
1831
361k
                            gau1_cu_pos_x[blk_cnt]; /* Populate properly */
1832
361k
                        ps_cu_node->ps_parent->u2_y0 =
1833
361k
                            gau1_cu_pos_y[blk_cnt]; /* Populate properly */
1834
1835
                        //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
1836
1837
                        /* Eval for TUSize = CuSize */
1838
361k
                        ihevce_mode_eval_filtering(
1839
361k
                            ps_cu_node->ps_parent,
1840
361k
                            ps_cu_node,
1841
361k
                            ps_ctxt,
1842
361k
                            ps_curr_src,
1843
361k
                            26,
1844
361k
                            &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
1845
361k
                            &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1846
361k
                            step2_bypass,
1847
361k
                            1);
1848
1849
361k
                        if(i4_enable_1cu_4tu)
1850
100k
                        {
1851
                            /* Eval for TUSize = CuSize/2 */
1852
100k
                            ihevce_mode_eval_filtering(
1853
100k
                                ps_cu_node->ps_parent,
1854
100k
                                ps_cu_node,
1855
100k
                                ps_ctxt,
1856
100k
                                ps_curr_src,
1857
100k
                                26,
1858
100k
                                &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1859
100k
                                &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1860
100k
                                step2_bypass,
1861
100k
                                0);
1862
100k
                        }
1863
260k
                        else
1864
260k
                        {
1865
                            /* 4TU not evaluated :  4tu modes set same as 1tu modes */
1866
260k
                            memcpy(
1867
260k
                                &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1868
260k
                                &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1869
260k
                                NUM_BEST_MODES);
1870
1871
                            /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
1872
260k
                            memcpy(
1873
260k
                                &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1874
260k
                                &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
1875
260k
                                NUM_BEST_MODES * sizeof(WORD32));
1876
260k
                        }
1877
1878
361k
                        ps_ctxt->u1_disable_child_cu_decide = 0;
1879
361k
                        step2_bypass = 1;
1880
1881
                        /* Update parent cost */
1882
361k
                        parent_cost =
1883
361k
                            MIN(ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1884
361k
                                ps_cu_node->ps_parent->au4_best_cost_1tu[0]);
1885
1886
                        /* Select the best mode to be populated as top and left nbr depending on the
1887
                        4tu and 1tu cost */
1888
361k
                        if(ps_cu_node->ps_parent->au4_best_cost_4tu[0] >
1889
361k
                           ps_cu_node->ps_parent->au4_best_cost_1tu[0])
1890
8.45k
                        {
1891
8.45k
                            ps_cu_node->ps_parent->best_mode =
1892
8.45k
                                ps_cu_node->ps_parent->au1_best_mode_1tu[0];
1893
8.45k
                        }
1894
352k
                        else
1895
352k
                        {
1896
352k
                            ps_cu_node->ps_parent->best_mode =
1897
352k
                                ps_cu_node->ps_parent->au1_best_mode_4tu[0];
1898
352k
                        }
1899
1900
                        /* store the 16x16 cost */
1901
361k
                        *pi4_intra_16_cost = parent_cost;
1902
1903
                        /* accumulate the 32x32 cost */
1904
361k
                        if(MAX_INTRA_COST_IPE == *pi4_intra_32_cost)
1905
88.5k
                        {
1906
88.5k
                            *pi4_intra_32_cost = parent_cost;
1907
88.5k
                        }
1908
272k
                        else
1909
272k
                        {
1910
272k
                            *pi4_intra_32_cost += parent_cost;
1911
272k
                        }
1912
1913
                        /* set the CU valid flag */
1914
361k
                        ps_intra16_analyse->b1_valid_cu = 1;
1915
1916
                        /* storing the modes to intra 16 analyse */
1917
361k
                        {
1918
                            /* store the best 16x16 modes 16x16 tu */
1919
361k
                            memcpy(
1920
361k
                                &ps_intra16_analyse->au1_best_modes_16x16_tu[0],
1921
361k
                                &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1922
361k
                                sizeof(UWORD8) * NUM_BEST_MODES);
1923
361k
                            ps_intra16_analyse->au1_best_modes_16x16_tu[NUM_BEST_MODES] = 255;
1924
1925
                            /* store the best 16x16 modes 8x8 tu */
1926
361k
                            memcpy(
1927
361k
                                &ps_intra16_analyse->au1_best_modes_8x8_tu[0],
1928
361k
                                &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1929
361k
                                sizeof(UWORD8) * NUM_BEST_MODES);
1930
361k
                            ps_intra16_analyse->au1_best_modes_8x8_tu[NUM_BEST_MODES] = 255;
1931
361k
                        }
1932
1933
361k
                        parent_best_mode = ps_cu_node->ps_parent->best_mode;
1934
361k
                        if(parent_cost <=
1935
361k
                           child_cost_least + (ps_ctxt->i4_ol_satd_lambda * CHILD_BIAS >>
1936
361k
                                               LAMBDA_Q_SHIFT))  //|| identical_modes)
1937
330k
                        {
1938
330k
                            WORD32 i4_q_scale_q3_mod;
1939
330k
                            WORD8 i1_cu_possible_qp;
1940
330k
                            WORD32 i4_act_factor;
1941
                            //choose parent CU
1942
1943
330k
                            ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
1944
1945
                            /* set the 16x16 non split flag */
1946
330k
                            ps_intra16_analyse->b1_split_flag = 0;
1947
1948
                            /*As 16*16 has won, pick L1 8x8 qp which maps
1949
                            to L0 16x16 Qp*/
1950
330k
                            ASSERT(((blk_cnt >> 4) & 3) == (blk_cnt >> 4));
1951
330k
                            ASSERT(ps_ed_ctb_l1->i4_16x16_satd[blk_cnt >> 4][0] != -2);
1952
330k
                            i1_cu_possible_qp = ihevce_cu_level_qp_mod(
1953
330k
                                ps_ctxt->i4_qscale,
1954
330k
                                ps_ed_ctb_l1->i4_16x16_satd[blk_cnt >> 4][0],
1955
330k
                                ps_ctxt->ld_curr_frame_8x8_log_avg[0],
1956
330k
                                f_strength,
1957
330k
                                &i4_act_factor,
1958
330k
                                &i4_q_scale_q3_mod,
1959
330k
                                ps_ctxt->ps_rc_quant_ctxt);
1960
1961
                            /* cost accumulation of best cu size candidate */
1962
330k
                            i8_frame_acc_satd_cost += parent_cost;
1963
1964
                            /* satd and mpm bits accumulation of best cu size candidate */
1965
330k
                            i4_ctb_acc_satd += ps_cu_node->ps_parent->best_satd;
1966
1967
                            /*satd/mod_qp accumulation of best cu */
1968
330k
                            i8_frame_acc_satd_by_modqp_q10 +=
1969
330k
                                ((LWORD64)ps_cu_node->ps_parent->best_satd
1970
330k
                                 << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
1971
330k
                                i4_q_scale_q3_mod;
1972
1973
                            /* Accumulate mode bits of the parent (16x16) CU */
1974
330k
                            i8_frame_acc_mode_bits_cost += ps_cu_node->ps_parent->u2_mode_bits_cost;
1975
1976
330k
                            blk_cnt += 4;
1977
330k
                            ps_ed_blk_l1 += 4;
1978
                            //ps_row_cu++;
1979
330k
                        }
1980
30.7k
                        else
1981
30.7k
                        {
1982
                            //choose child CU
1983
30.7k
                            WORD8 i1_cu_possible_qp;
1984
30.7k
                            WORD32 i4_act_factor;
1985
30.7k
                            WORD32 i4_q_scale_q3_mod;
1986
1987
30.7k
                            ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
1988
30.7k
                            ASSERT(ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][1] != -2);
1989
30.7k
                            i1_cu_possible_qp = ihevce_cu_level_qp_mod(
1990
30.7k
                                ps_ctxt->i4_qscale,
1991
30.7k
                                ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][1],
1992
30.7k
                                ps_ctxt->ld_curr_frame_8x8_log_avg[1],
1993
30.7k
                                f_strength,
1994
30.7k
                                &i4_act_factor,
1995
30.7k
                                &i4_q_scale_q3_mod,
1996
30.7k
                                ps_ctxt->ps_rc_quant_ctxt);
1997
1998
                            /* set the 16x16 split flag */
1999
30.7k
                            ps_intra16_analyse->b1_split_flag = 1;
2000
2001
153k
                            for(j = 0; j < 4; j++)
2002
122k
                            {
2003
122k
                                ihevce_update_cand_list(
2004
122k
                                    ps_cu_node->ps_sub_cu[j], ps_ed_blk_l1, ps_ctxt);
2005
2006
122k
                                if((IHEVCE_QUALITY_P3 > i4_quality_preset))
2007
53.7k
                                {
2008
53.7k
                                    WORD32 k;
2009
53.7k
                                    intra8_analyse_t *ps_intra8_analyse;
2010
53.7k
                                    ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[j];
2011
2012
268k
                                    for(k = 0; k < 4; k++)
2013
214k
                                    {
2014
                                        /* Populate best 3 nxn modes */
2015
214k
                                        ps_intra8_analyse->au1_4x4_best_modes[k][0] =
2016
214k
                                            ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0];
2017
214k
                                        ps_intra8_analyse->au1_4x4_best_modes[k][1] =
2018
214k
                                            ps_cu_node->ps_sub_cu[j]
2019
214k
                                                ->au1_best_mode_4tu[1];  //(ps_ed + 1)->best_mode;
2020
214k
                                        ps_intra8_analyse->au1_4x4_best_modes[k][2] =
2021
214k
                                            ps_cu_node->ps_sub_cu[j]
2022
214k
                                                ->au1_best_mode_4tu[2];  //(ps_ed + 2)->best_mode;
2023
214k
                                        ps_intra8_analyse->au1_4x4_best_modes[k][3] = 255;
2024
214k
                                    }
2025
53.7k
                                }
2026
                                /* accumulate satd/mod_qp for all child blocks */
2027
122k
                                i8_frame_acc_satd_by_modqp_q10 +=
2028
122k
                                    ((LWORD64)child_satd[j]
2029
122k
                                     << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
2030
122k
                                    i4_q_scale_q3_mod;
2031
2032
                                /* Accumulate mode bits for all child blocks */
2033
122k
                                i8_frame_acc_mode_bits_cost +=
2034
122k
                                    ps_cu_node->ps_sub_cu[j]->u2_mode_bits_cost;
2035
2036
                                /* satd and mpm bits accumulation of best cu size candidate */
2037
122k
                                i4_ctb_acc_satd += child_satd[j];
2038
2039
122k
                                blk_cnt += 1;
2040
122k
                                ps_ed_blk_l1 += 1;
2041
                                //ps_row_cu++;
2042
122k
                            }
2043
2044
                            /* cost accumulation of best cu size candidate */
2045
30.7k
                            i8_frame_acc_satd_cost += child_cost_least;
2046
30.7k
                        }
2047
2048
361k
                    }  //else of EIID
2049
506k
#endif
2050
506k
                }  // if(merge_16x16_l1)
2051
                /* MAX CU SIZE 8x8 */
2052
208k
                else
2053
208k
                {
2054
#if IP_DBG_L1_l2
2055
                    for(i = 0; i < 4; i++)
2056
                    {
2057
                        ps_cu_node->ps_parent->u1_cu_size = 8;
2058
                        ps_cu_node->ps_parent->u2_x0 =
2059
                            gau1_cu_pos_x[blk_cnt]; /* Populate properly */
2060
                        ps_cu_node->ps_parent->u2_y0 =
2061
                            gau1_cu_pos_y[blk_cnt]; /* Populate properly */
2062
                        ps_cu_node->ps_parent->best_mode = ps_ed_blk_l1->best_mode;
2063
2064
                        ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
2065
                        blk_cnt++;
2066
                        ps_ed_blk_l1++;
2067
                        ps_row_cu++;
2068
                        merge_64x64 = 0;
2069
                    }
2070
#else
2071
2072
                    /* EIID: Skip all four 8x8 blocks if the L1 decision says skip intra */
2073
208k
                    if(ps_ed_blk_l1->intra_or_inter == 2 && (ps_ctxt->i4_slice_type != ISLICE))
2074
34.1k
                    {
2075
34.1k
                        WORD32 i4_q_scale_q3_mod;
2076
34.1k
                        WORD8 i1_cu_possible_qp;
2077
34.1k
                        WORD32 i4_act_factor;
2078
2079
34.1k
                        merge_64x64 = 0;
2080
2081
34.1k
                        ps_intra32_analyse->b1_merge_flag = 0;
2082
2083
34.1k
                        ps_intra16_analyse->au1_best_modes_8x8_tu[0] = 255;
2084
34.1k
                        ps_intra16_analyse->au1_best_modes_8x8_tu[1] = 255;
2085
34.1k
                        ps_intra16_analyse->au1_best_modes_8x8_tu[2] = 255;
2086
2087
34.1k
                        ps_intra16_analyse->au1_best_modes_16x16_tu[0] = 255;
2088
34.1k
                        ps_intra16_analyse->au1_best_modes_16x16_tu[1] = 255;
2089
34.1k
                        ps_intra16_analyse->au1_best_modes_16x16_tu[2] = 255;
2090
34.1k
                        ps_intra16_analyse->b1_split_flag = 1;
2091
34.1k
                        ps_intra16_analyse->b1_valid_cu = 0;
2092
34.1k
                        ps_intra16_analyse->b1_merge_flag = 0;
2093
2094
170k
                        for(i = 0; i < 4; i++)
2095
136k
                        {
2096
136k
                            intra8_analyse_t *ps_intra8_analyse;
2097
136k
                            WORD32 ctr_sub_cu;
2098
2099
136k
                            cu_pos_x = gau1_cu_pos_x[blk_cnt];
2100
136k
                            cu_pos_y = gau1_cu_pos_y[blk_cnt];
2101
2102
136k
                            if((cu_pos_x < num_8x8_blks_x) && (cu_pos_y < num_8x8_blks_y))
2103
121k
                            {
2104
121k
                                ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[i];
2105
2106
121k
                                ps_intra8_analyse->b1_valid_cu = 0;
2107
121k
                                ps_intra8_analyse->b1_enable_nxn = 0;
2108
121k
                                ps_intra8_analyse->au1_4x4_best_modes[0][0] = 255;
2109
121k
                                ps_intra8_analyse->au1_4x4_best_modes[1][0] = 255;
2110
121k
                                ps_intra8_analyse->au1_4x4_best_modes[2][0] = 255;
2111
121k
                                ps_intra8_analyse->au1_4x4_best_modes[3][0] = 255;
2112
121k
                                ps_intra8_analyse->au1_best_modes_4x4_tu[0] = 255;
2113
121k
                                ps_intra8_analyse->au1_best_modes_8x8_tu[0] = 255;
2114
2115
121k
                                ps_cu_node->ps_parent->u1_cu_size = 8;
2116
121k
                                ps_cu_node->ps_parent->u2_x0 =
2117
121k
                                    gau1_cu_pos_x[blk_cnt]; /* Populate properly */
2118
121k
                                ps_cu_node->ps_parent->u2_y0 =
2119
121k
                                    gau1_cu_pos_y[blk_cnt]; /* Populate properly */
2120
121k
                                ps_cu_node->ps_parent->best_mode =
2121
121k
                                    INTRA_DC;  //ps_ed_blk_l1->best_mode;
2122
2123
                                /* intra is skipped here: fill in the first modes with INTRA_DC as placeholders */
2124
2125
121k
                                ps_cu_node->ps_parent->au1_best_mode_1tu[0] = INTRA_DC;
2126
121k
                                ps_cu_node->ps_parent->au1_best_mode_1tu[1] =
2127
121k
                                    INTRA_DC;  //for safery. Since update_cand_list will set num_modes as 3
2128
121k
                                ps_cu_node->ps_parent->au1_best_mode_1tu[2] = INTRA_DC;
2129
2130
121k
                                ps_cu_node->ps_parent->au1_best_mode_4tu[0] = INTRA_DC;
2131
121k
                                ps_cu_node->ps_parent->au1_best_mode_4tu[1] = INTRA_DC;
2132
121k
                                ps_cu_node->ps_parent->au1_best_mode_4tu[2] = INTRA_DC;
2133
2134
121k
                                ihevce_update_cand_list(
2135
121k
                                    ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
2136
2137
                                //ps_row_cu->s_cu_intra_cand.b6_num_intra_cands = 0;
2138
                                //ps_row_cu->u1_num_intra_rdopt_cands = 0;
2139
2140
607k
                                for(ctr_sub_cu = 0; ctr_sub_cu < 4; ctr_sub_cu++)
2141
486k
                                {
2142
486k
                                    ps_cu_node->ps_sub_cu[ctr_sub_cu]->au1_best_mode_1tu[0] =
2143
486k
                                        INTRA_DC;
2144
486k
                                    ps_cu_node->ps_sub_cu[ctr_sub_cu]->au1_best_mode_4tu[0] =
2145
486k
                                        INTRA_DC;
2146
486k
                                    ps_cu_node->ps_sub_cu[ctr_sub_cu]->au4_best_cost_1tu[0] =
2147
486k
                                        MAX_INTRA_COST_IPE;
2148
2149
486k
                                    ps_cu_node->ps_sub_cu[ctr_sub_cu]->au4_best_cost_4tu[0] =
2150
486k
                                        MAX_INTRA_COST_IPE;
2151
486k
                                    ps_cu_node->ps_sub_cu[ctr_sub_cu]->best_cost =
2152
486k
                                        MAX_INTRA_COST_IPE;
2153
486k
                                }
2154
2155
121k
                                pi4_intra_8_cost[(i & 1) + (MAX_CU_IN_CTB_ROW * (i >> 1))] =
2156
121k
                                    MAX_INTRA_COST_IPE;
2157
2158
121k
                                ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
2159
121k
                                ASSERT(ps_ed_ctb_l1->i4_8x8_satd[(blk_cnt >> 2)][1] != -2);
2160
121k
                                i1_cu_possible_qp = ihevce_cu_level_qp_mod(
2161
121k
                                    ps_ctxt->i4_qscale,
2162
121k
                                    ps_ed_ctb_l1->i4_8x8_satd[(blk_cnt >> 2)][1],
2163
121k
                                    ps_ctxt->ld_curr_frame_8x8_log_avg[1],
2164
121k
                                    f_strength,
2165
121k
                                    &i4_act_factor,
2166
121k
                                    &i4_q_scale_q3_mod,
2167
121k
                                    ps_ctxt->ps_rc_quant_ctxt);
2168
2169
                                /* set neighbours even if intra is not evaluated, since source is always available. */
2170
121k
                                ihevce_set_nbr_map(
2171
121k
                                    ps_ctxt->pu1_ctb_nbr_map,
2172
121k
                                    ps_ctxt->i4_nbr_map_strd,
2173
121k
                                    ps_cu_node->ps_parent->u2_x0 << 1,
2174
121k
                                    ps_cu_node->ps_parent->u2_y0 << 1,
2175
121k
                                    (ps_cu_node->ps_parent->u1_cu_size >> 2),
2176
121k
                                    1);
2177
2178
                                //ps_row_cu++;
2179
121k
                            }
2180
136k
                            blk_cnt++;
2181
136k
                            ps_ed_blk_l1++;
2182
136k
                        }
2183
34.1k
                    }
2184
174k
                    else
2185
174k
                    {
2186
                        //cu_intra_cand_t *ps_cu_intra_cand;
2187
174k
                        WORD8 i1_cu_possible_qp;
2188
174k
                        WORD32 i4_act_factor;
2189
174k
                        WORD32 i4_q_scale_q3_mod;
2190
2191
174k
                        ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
2192
174k
                        ASSERT(ps_ed_ctb_l1->i4_8x8_satd[(blk_cnt >> 2)][1] != -2);
2193
174k
                        i1_cu_possible_qp = ihevce_cu_level_qp_mod(
2194
174k
                            ps_ctxt->i4_qscale,
2195
174k
                            ps_ed_ctb_l1->i4_8x8_satd[(blk_cnt >> 2)][1],
2196
174k
                            ps_ctxt->ld_curr_frame_8x8_log_avg[1],
2197
174k
                            f_strength,
2198
174k
                            &i4_act_factor,
2199
174k
                            &i4_q_scale_q3_mod,
2200
174k
                            ps_ctxt->ps_rc_quant_ctxt);
2201
2202
                        /* 64x64 merge is not possible */
2203
174k
                        merge_64x64 = 0;
2204
2205
174k
                        ps_intra32_analyse->b1_merge_flag = 0;
2206
2207
174k
                        ps_intra16_analyse->b1_merge_flag = 0;
2208
2209
                        /* by default 16x16 modes are set to default values DC and Planar */
2210
174k
                        ps_intra16_analyse->au1_best_modes_8x8_tu[0] = 0;
2211
174k
                        ps_intra16_analyse->au1_best_modes_8x8_tu[1] = 1;
2212
174k
                        ps_intra16_analyse->au1_best_modes_8x8_tu[2] = 255;
2213
2214
174k
                        ps_intra16_analyse->au1_best_modes_16x16_tu[0] = 0;
2215
174k
                        ps_intra16_analyse->au1_best_modes_16x16_tu[1] = 1;
2216
174k
                        ps_intra16_analyse->au1_best_modes_16x16_tu[2] = 255;
2217
174k
                        ps_intra16_analyse->b1_split_flag = 1;
2218
174k
                        ps_intra16_analyse->b1_valid_cu = 1;
2219
2220
871k
                        for(i = 0; i < 4; i++)
2221
697k
                        {
2222
697k
                            intra8_analyse_t *ps_intra8_analyse;
2223
697k
                            cu_pos_x = gau1_cu_pos_x[blk_cnt];
2224
697k
                            cu_pos_y = gau1_cu_pos_y[blk_cnt];
2225
697k
                            if((cu_pos_x < num_8x8_blks_x) && (cu_pos_y < num_8x8_blks_y))
2226
578k
                            {
2227
                                //ps_cu_intra_cand = &ps_row_cu->s_cu_intra_cand;
2228
                                //ps_cu_node->ps_parent->best_cost = MAX_INTRA_COST_IPE;
2229
2230
                                //ps_cu_node->ps_parent->best_mode = ps_ed_blk_l1->best_mode;
2231
2232
578k
                                child_cost_least = 0;
2233
2234
578k
                                ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[i];
2235
578k
                                ps_cu_node->ps_parent->u1_cu_size = 8;
2236
578k
                                ps_cu_node->ps_parent->u2_x0 =
2237
578k
                                    gau1_cu_pos_x[blk_cnt]; /* Populate properly */
2238
578k
                                ps_cu_node->ps_parent->u2_y0 =
2239
578k
                                    gau1_cu_pos_y[blk_cnt]; /* Populate properly */
2240
2241
                                //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
2242
2243
                                /*EARLY DECISION 8x8 block */
2244
578k
                                ihevce_pu_calc_8x8_blk(
2245
578k
                                    ps_curr_src, ps_ctxt, ps_cu_node, ps_ctxt->ps_func_selector);
2246
2.89M
                                for(j = 0; j < 4; j++)
2247
2.31M
                                {
2248
2.31M
                                    child_cost_least += ps_cu_node->ps_sub_cu[j]->best_cost;
2249
2.31M
                                    child_satd[j] = ps_cu_node->ps_sub_cu[j]->best_satd;
2250
2.31M
                                }
2251
2252
                                /* Based on the flag, CU = 4TU modes decision can be disabled, CU = 4PU is retained */
2253
578k
                                if(0 == ps_ctxt->u1_disable_child_cu_decide)
2254
578k
                                {
2255
578k
                                    ihevce_set_nbr_map(
2256
578k
                                        ps_ctxt->pu1_ctb_nbr_map,
2257
578k
                                        ps_ctxt->i4_nbr_map_strd,
2258
578k
                                        ps_cu_node->ps_parent->u2_x0 << 1,
2259
578k
                                        ps_cu_node->ps_parent->u2_y0 << 1,
2260
578k
                                        (ps_cu_node->ps_parent->u1_cu_size >> 2),
2261
578k
                                        0);
2262
2263
                                    //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
2264
2265
                                    /* Eval for TUSize = CuSize */
2266
578k
                                    ihevce_mode_eval_filtering(
2267
578k
                                        ps_cu_node->ps_parent,
2268
578k
                                        ps_cu_node,
2269
578k
                                        ps_ctxt,
2270
578k
                                        ps_curr_src,
2271
578k
                                        26,
2272
578k
                                        &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
2273
578k
                                        &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
2274
578k
                                        step2_bypass,
2275
578k
                                        1);
2276
2277
578k
                                    if(i4_enable_1cu_4tu)
2278
314k
                                    {
2279
                                        /* Eval for TUSize = CuSize/2 */
2280
314k
                                        ihevce_mode_eval_filtering(
2281
314k
                                            ps_cu_node->ps_parent,
2282
314k
                                            ps_cu_node,
2283
314k
                                            ps_ctxt,
2284
314k
                                            ps_curr_src,
2285
314k
                                            26,
2286
314k
                                            &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
2287
314k
                                            &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
2288
314k
                                            step2_bypass,
2289
314k
                                            0);
2290
314k
                                    }
2291
263k
                                    else
2292
263k
                                    {
2293
                                        /* 4TU not evaluated :  4tu modes set same as 1tu modes */
2294
263k
                                        memcpy(
2295
263k
                                            &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
2296
263k
                                            &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
2297
263k
                                            NUM_BEST_MODES);
2298
2299
                                        /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
2300
263k
                                        memcpy(
2301
263k
                                            &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
2302
263k
                                            &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
2303
263k
                                            NUM_BEST_MODES * sizeof(WORD32));
2304
263k
                                    }
2305
2306
                                    /* Update parent cost */
2307
578k
                                    parent_cost =
2308
578k
                                        MIN(ps_cu_node->ps_parent->au4_best_cost_4tu[0],
2309
578k
                                            ps_cu_node->ps_parent->au4_best_cost_1tu[0]);
2310
2311
                                    /* Select the best mode to be populated as top and left nbr depending on the
2312
                            4tu and 1tu cost */
2313
578k
                                    if(ps_cu_node->ps_parent->au4_best_cost_4tu[0] >
2314
578k
                                       ps_cu_node->ps_parent->au4_best_cost_1tu[0])
2315
47.7k
                                    {
2316
47.7k
                                        ps_cu_node->ps_parent->best_mode =
2317
47.7k
                                            ps_cu_node->ps_parent->au1_best_mode_1tu[0];
2318
47.7k
                                    }
2319
531k
                                    else
2320
531k
                                    {
2321
531k
                                        ps_cu_node->ps_parent->best_mode =
2322
531k
                                            ps_cu_node->ps_parent->au1_best_mode_4tu[0];
2323
531k
                                    }
2324
578k
                                }
2325
2326
                                /* set the CU valid flag */
2327
578k
                                ps_intra8_analyse->b1_valid_cu = 1;
2328
578k
                                ps_intra8_analyse->b1_enable_nxn = 0;
2329
2330
                                /* storing the modes to intra 8 analyse */
2331
2332
                                /* store the best 8x8 modes 8x8 tu */
2333
578k
                                memcpy(
2334
578k
                                    &ps_intra8_analyse->au1_best_modes_8x8_tu[0],
2335
578k
                                    &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
2336
578k
                                    sizeof(UWORD8) * (NUM_BEST_MODES));
2337
578k
                                ps_intra8_analyse->au1_best_modes_8x8_tu[NUM_BEST_MODES] = 255;
2338
2339
                                /* store the best 8x8 modes 4x4 tu */
2340
578k
                                memcpy(
2341
578k
                                    &ps_intra8_analyse->au1_best_modes_4x4_tu[0],
2342
578k
                                    &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
2343
578k
                                    sizeof(UWORD8) * (NUM_BEST_MODES));
2344
578k
                                ps_intra8_analyse->au1_best_modes_4x4_tu[NUM_BEST_MODES] = 255;
2345
2346
                                /*As 8*8 has won, pick L1 4x4 qp which is equal to
2347
                                L1 8x8 Qp*/
2348
                                //ps_row_cu->u1_cu_possible_qp[0] = u1_cu_possible_qp;
2349
                                //ps_row_cu->i4_act_factor[0][1] = i4_act_factor;
2350
2351
578k
                                parent_best_mode = ps_cu_node->ps_parent->best_mode;
2352
578k
                                if(parent_cost <=
2353
578k
                                   child_cost_least +
2354
578k
                                       (ps_ctxt->i4_ol_satd_lambda * CHILD_BIAS >> LAMBDA_Q_SHIFT))
2355
317k
                                {
2356
                                    /*CU = 4TU */
2357
317k
                                    ihevce_update_cand_list(
2358
317k
                                        ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
2359
2360
                                    /* store the child 8x8 costs */
2361
317k
                                    pi4_intra_8_cost[(i & 1) + (MAX_CU_IN_CTB_ROW * (i >> 1))] =
2362
317k
                                        parent_cost;
2363
2364
                                    /* cost accumulation of best cu size candidate */
2365
317k
                                    i8_frame_acc_satd_cost += parent_cost;
2366
2367
                                    /*satd/mod_qp accumulation of best cu */
2368
317k
                                    i8_frame_acc_satd_by_modqp_q10 +=
2369
317k
                                        ((LWORD64)ps_cu_node->ps_parent->best_satd
2370
317k
                                         << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
2371
317k
                                        i4_q_scale_q3_mod;
2372
2373
                                    /* Accumulate mode bits of the parent block */
2374
317k
                                    i8_frame_acc_mode_bits_cost +=
2375
317k
                                        ps_cu_node->ps_parent->u2_mode_bits_cost;
2376
2377
                                    /* satd and mpm bits accumulation of best cu size candidate */
2378
317k
                                    i4_ctb_acc_satd += ps_cu_node->ps_parent->best_satd;
2379
2380
                                    /* accumulate the 16x16 cost*/
2381
317k
                                    if(MAX_INTRA_COST_IPE == *pi4_intra_16_cost)
2382
91.1k
                                    {
2383
91.1k
                                        *pi4_intra_16_cost = parent_cost;
2384
91.1k
                                    }
2385
226k
                                    else
2386
226k
                                    {
2387
226k
                                        *pi4_intra_16_cost += parent_cost;
2388
226k
                                    }
2389
2390
                                    /* accumulate the 32x32 cost*/
2391
317k
                                    if(MAX_INTRA_COST_IPE == *pi4_intra_32_cost)
2392
37.2k
                                    {
2393
37.2k
                                        *pi4_intra_32_cost = parent_cost;
2394
37.2k
                                    }
2395
280k
                                    else
2396
280k
                                    {
2397
280k
                                        *pi4_intra_32_cost += parent_cost;
2398
280k
                                    }
2399
317k
                                }
2400
261k
                                else
2401
261k
                                {
2402
                                    /*CU = 4PU*/
2403
                                    //ps_row_cu->b3_cu_pos_x = (UWORD8) ps_cu_node->ps_parent->u2_x0;
2404
                                    //ps_row_cu->b3_cu_pos_y = (UWORD8) ps_cu_node->ps_parent->u2_y0;
2405
                                    //ps_row_cu->u1_cu_size  = ps_cu_node->ps_parent->u1_cu_size;
2406
2407
                                    /* store the child 8x8 costs with 4x4 pu summed cost */
2408
261k
                                    pi4_intra_8_cost[(i & 1) + (MAX_CU_IN_CTB_ROW * (i >> 1))] =
2409
261k
                                        (child_cost_least);
2410
2411
                                    /* accumulate the 16x16 cost*/
2412
261k
                                    if(MAX_INTRA_COST_IPE == *pi4_intra_16_cost)
2413
83.2k
                                    {
2414
83.2k
                                        *pi4_intra_16_cost = child_cost_least;
2415
83.2k
                                    }
2416
177k
                                    else
2417
177k
                                    {
2418
177k
                                        *pi4_intra_16_cost += child_cost_least;
2419
177k
                                    }
2420
2421
                                    /* cost accumulation of best cu size candidate */
2422
261k
                                    i8_frame_acc_satd_cost += child_cost_least;
2423
2424
1.30M
                                    for(j = 0; j < 4; j++)
2425
1.04M
                                    {
2426
                                        /*satd/qp accumulation*/
2427
1.04M
                                        i8_frame_acc_satd_by_modqp_q10 +=
2428
1.04M
                                            ((LWORD64)child_satd[j]
2429
1.04M
                                             << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
2430
1.04M
                                            i4_q_scale_q3_mod;
2431
2432
                                        /* Accumalate mode bits for all child blocks */
2433
1.04M
                                        i8_frame_acc_mode_bits_cost +=
2434
1.04M
                                            ps_cu_node->ps_sub_cu[j]->u2_mode_bits_cost;
2435
2436
                                        /* satd and mpm bits accumalation of best cu size candiate */
2437
1.04M
                                        i4_ctb_acc_satd += child_satd[j];
2438
1.04M
                                    }
2439
2440
                                    /* accumulate the 32x32 cost*/
2441
261k
                                    if(MAX_INTRA_COST_IPE == *pi4_intra_32_cost)
2442
32.9k
                                    {
2443
32.9k
                                        *pi4_intra_32_cost = child_cost_least;
2444
32.9k
                                    }
2445
228k
                                    else
2446
228k
                                    {
2447
228k
                                        *pi4_intra_32_cost += child_cost_least;
2448
228k
                                    }
2449
2450
261k
                                    ps_intra8_analyse->b1_enable_nxn = 1;
2451
2452
                                    /* Insert the best 8x8 modes unconditionally */
2453
2454
261k
                                    x = ((ps_cu_node->u2_x0 << 3) >> 2) + 1;
2455
261k
                                    y = ((ps_cu_node->u2_y0 << 3) >> 2) + 1;
2456
261k
                                    size = ps_cu_node->u1_cu_size >> 2;
2457
2458
261k
                                    ps_ctxt->au1_ctb_mode_map[y][x] =
2459
261k
                                        ps_cu_node->ps_sub_cu[0]->best_mode;
2460
261k
                                    ps_ctxt->au1_ctb_mode_map[y][x + 1] =
2461
261k
                                        ps_cu_node->ps_sub_cu[1]->best_mode;
2462
261k
                                    ps_ctxt->au1_ctb_mode_map[y + 1][x] =
2463
261k
                                        ps_cu_node->ps_sub_cu[2]->best_mode;
2464
261k
                                    ps_ctxt->au1_ctb_mode_map[y + 1][x + 1] =
2465
261k
                                        ps_cu_node->ps_sub_cu[3]->best_mode;
2466
261k
                                }
2467
                                /* NXN mode population */
2468
2.89M
                                for(j = 0; j < 4; j++)
2469
2.31M
                                {
2470
2.31M
                                    cand_mode_list[0] =
2471
2.31M
                                        ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0];
2472
2.31M
                                    cand_mode_list[1] =
2473
2.31M
                                        ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[1];
2474
2.31M
                                    cand_mode_list[2] =
2475
2.31M
                                        ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[2];
2476
2477
2.31M
                                    if(1)
2478
2.31M
                                    {
2479
                                        /* Populate best 3 nxn modes */
2480
2.31M
                                        ps_intra8_analyse->au1_4x4_best_modes[j][0] =
2481
2.31M
                                            cand_mode_list[0];
2482
2.31M
                                        ps_intra8_analyse->au1_4x4_best_modes[j][1] =
2483
2.31M
                                            cand_mode_list[1];  //(ps_ed + 1)->best_mode;
2484
2.31M
                                        ps_intra8_analyse->au1_4x4_best_modes[j][2] =
2485
2.31M
                                            cand_mode_list[2];  //(ps_ed + 2)->best_mode;
2486
2.31M
                                        ps_intra8_analyse->au1_4x4_best_modes[j][3] = 255;
2487
2488
                                        //memcpy(ps_intra8_analyse->au1_4x4_best_modes[j], ps_row_cu->s_cu_intra_cand.au1_intra_luma_modes_nxn[j], 4);
2489
2.31M
                                    }
2490
                                    /* For HQ, all 35 modes to be used for RDOPT, removed from here for memory clean-up */
2491
2492
0
                                    else /* IHEVCE_QUALITY_P0 == i4_quality_preset */
2493
0
                                    {
2494
                                        /* To indicate to enc loop that NXN is enabled in HIGH QUALITY for CU 8x8*/
2495
0
                                        ps_intra8_analyse->au1_4x4_best_modes[j][0] = 0;
2496
0
                                    }
2497
2498
2.31M
                                    ps_intra8_analyse
2499
2.31M
                                        ->au1_4x4_best_modes[j][MAX_INTRA_CU_CANDIDATES] = 255;
2500
2.31M
                                }
2501
2502
                                //ps_row_cu++;
2503
578k
                            }
2504
118k
                            else
2505
118k
                            {
2506
                                /* For Incomplete CTB, 16x16 is not valid */
2507
118k
                                ps_intra16_analyse->b1_valid_cu = 0;
2508
118k
                            }
2509
697k
                            blk_cnt++;
2510
697k
                            ps_ed_blk_l1++;
2511
697k
                        }
2512
                        //ps_ed_blk_l2 ++;
2513
174k
                    }  //else of EIID
2514
208k
#endif
2515
208k
                }
2516
1.30M
            }
2517
750k
            else
2518
750k
            {
2519
                /* For incomplete CTB, init valid CU to 0 */
2520
750k
                ps_ed_blk_l1++;
2521
750k
                ps_intra32_analyse->b1_valid_cu = 0;
2522
750k
                ps_intra16_analyse[0].b1_valid_cu = 0;
2523
750k
                blk_cnt++;
2524
750k
                merge_64x64 = 0;
2525
750k
            }
2526
2.05M
        } while(blk_cnt != MAX_CTB_SIZE);
2527
        /* if 64x64 merge is possible then check for 32x32 having same best modes */
2528
203k
        if(1 == merge_64x64)
2529
66.2k
        {
2530
66.2k
            WORD32 act_mode = au1_best_32x32_modes[0];
2531
2532
66.2k
            ps_ed_blk_l2 = ps_ed_l2_ctb;
2533
66.2k
            best_mode = ps_ed_blk_l2->best_mode;
2534
66.2k
            merge_64x64 =
2535
66.2k
                ((act_mode == au1_best_32x32_modes[0]) + (act_mode == au1_best_32x32_modes[1]) +
2536
66.2k
                     (act_mode == au1_best_32x32_modes[2]) +
2537
66.2k
                     (act_mode == au1_best_32x32_modes[3]) ==
2538
66.2k
                 4);
2539
66.2k
            if(merge_64x64 == 1)
2540
60.5k
                best_mode = au1_best_32x32_modes[0];
2541
5.71k
            else
2542
5.71k
                best_mode = ps_ed_blk_l2->best_mode;
2543
            /* All 32x32 costs are accumulated to 64x64 cost */
2544
66.2k
            ps_l0_ipe_out_ctb->i4_best64x64_intra_cost = 0;
2545
331k
            for(i = 0; i < 4; i++)
2546
265k
            {
2547
265k
                ps_l0_ipe_out_ctb->i4_best64x64_intra_cost +=
2548
265k
                    ps_l0_ipe_out_ctb->ai4_best32x32_intra_cost[i];
2549
265k
            }
2550
2551
            /* If all modes of 32x32 block is not same */
2552
66.2k
            if(0 == merge_64x64)
2553
5.71k
            {
2554
                /*Compute CHILD cost for 32x32 */
2555
5.71k
                WORD32 child_cost_64x64 = au4_best_32x32_cost[0] + au4_best_32x32_cost[1] +
2556
5.71k
                                          au4_best_32x32_cost[2] + au4_best_32x32_cost[3];
2557
5.71k
                WORD32 cost = MAX_INTRA_COST_IPE;
2558
2559
5.71k
                WORD32 best_mode_temp = 0;
2560
                /*Compute 64x64 cost for each mode of 32x32*/
2561
28.5k
                for(i = 0; i < 4; i++)
2562
22.8k
                {
2563
22.8k
                    WORD32 mode = au1_best_32x32_modes[i];
2564
22.8k
                    if(mode < 2)
2565
12.1k
                        mode = 26;
2566
22.8k
                    ps_cu_node->ps_parent->u1_cu_size = 64;
2567
22.8k
                    ps_cu_node->ps_parent->u2_x0 = gau1_cu_pos_x[0]; /* Populate properly */
2568
22.8k
                    ps_cu_node->ps_parent->u2_y0 = gau1_cu_pos_y[0]; /* Populate properly */
2569
2570
22.8k
                    ihevce_set_nbr_map(
2571
22.8k
                        ps_ctxt->pu1_ctb_nbr_map,
2572
22.8k
                        ps_ctxt->i4_nbr_map_strd,
2573
22.8k
                        (ps_cu_node->ps_parent->u2_x0 << 1),
2574
22.8k
                        (ps_cu_node->ps_parent->u2_y0 << 1),
2575
22.8k
                        (ps_cu_node->ps_parent->u1_cu_size >> 2),
2576
22.8k
                        0);
2577
2578
22.8k
                    ihevce_mode_eval_filtering(
2579
22.8k
                        ps_cu_node->ps_parent,
2580
22.8k
                        ps_cu_node,
2581
22.8k
                        ps_ctxt,
2582
22.8k
                        ps_curr_src,
2583
22.8k
                        mode,
2584
22.8k
                        &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
2585
22.8k
                        &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
2586
22.8k
                        !step2_bypass,
2587
22.8k
                        0);
2588
2589
22.8k
                    parent_cost = ps_cu_node->ps_parent->best_cost;
2590
22.8k
                    if(cost > parent_cost)
2591
6.82k
                    {
2592
6.82k
                        cost = parent_cost;
2593
6.82k
                        best_mode_temp = ps_cu_node->ps_parent->best_mode;
2594
6.82k
                    }
2595
22.8k
                }
2596
5.71k
                if(cost < child_cost_64x64)
2597
2.41k
                {
2598
2.41k
                    merge_64x64 = 1;
2599
2.41k
                    best_mode = best_mode_temp;
2600
2601
                    /* Update 64x64 cost if CU 64x64 is chosen  */
2602
2.41k
                    ps_l0_ipe_out_ctb->i4_best64x64_intra_cost = cost;
2603
2604
                    /* Accumulate the least cost for CU 64x64 */
2605
2.41k
                    i8_frame_acc_satd_cost = cost;
2606
2.41k
                    i8_frame_acc_mode_bits_cost = ps_cu_node->ps_parent->u2_mode_bits_cost;
2607
2608
                    /* satd and mpm bits accumulation of best cu size candidate */
2609
2.41k
                    i4_ctb_acc_satd = ps_cu_node->ps_parent->best_satd;
2610
2.41k
                }
2611
5.71k
            }
2612
66.2k
        }
2613
2614
203k
        if(merge_64x64)
2615
62.9k
        {
2616
62.9k
            WORD32 i, j;
2617
62.9k
            intra32_analyse_t *ps_intra32_analyse;
2618
62.9k
            intra16_analyse_t *ps_intra16_analyse;
2619
62.9k
            WORD32 row, col;
2620
62.9k
            WORD32 i4_q_scale_q3_mod;
2621
62.9k
            WORD8 i1_cu_possible_qp;
2622
62.9k
            WORD32 i4_act_factor;
2623
            //ps_row_cu = ps_curr_cu;
2624
62.9k
            ps_ctb_out->u4_cu_split_flags = 0x0;
2625
62.9k
            ps_ed_blk_l1 = ps_ed_l1_ctb;
2626
62.9k
            ps_ed_blk_l2 = ps_ed_l2_ctb;
2627
2628
62.9k
            ps_l0_ipe_out_ctb->u1_split_flag = 0;
2629
2630
            /* If CU size of 64x64 is chosen, disable all the 16x16 flag*/
2631
314k
            for(i = 0; i < 4; i++)
2632
251k
            {
2633
                /* get the corresponding intra 32 analyse pointer  use (blk_cnt / 16) */
2634
                /* blk cnt is in terms of 8x8 units so a 32x32 will have 16 8x8 units */
2635
251k
                ps_intra32_analyse = &ps_l0_ipe_out_ctb->as_intra32_analyse[i];
2636
2637
1.25M
                for(j = 0; j < 4; j++)
2638
1.00M
                {
2639
                    /* get the corresponding intra 16 analyse pointer use (blk_cnt & 0xF / 4)*/
2640
                    /* blk cnt is in terms of 8x8 units so a 16x16 will have 4 8x8 units */
2641
1.00M
                    ps_intra16_analyse = &ps_intra32_analyse->as_intra16_analyse[j];
2642
1.00M
                    ps_intra16_analyse->b1_merge_flag = 0;
2643
1.00M
                }
2644
251k
            }
2645
2646
            /* CU size 64x64 and fill the final cu params */
2647
            //ps_row_cu->b3_cu_pos_x = gau1_cu_pos_x[0];
2648
            //ps_row_cu->b3_cu_pos_y = gau1_cu_pos_y[0];
2649
            //ps_row_cu->u1_cu_size  = 64;
2650
2651
            /* Candidate mode Update */
2652
62.9k
            cand_mode_list[0] = best_mode;
2653
62.9k
            if(cand_mode_list[0] > 1)
2654
9.28k
            {
2655
9.28k
                if(cand_mode_list[0] == 2)
2656
1.02k
                {
2657
1.02k
                    cand_mode_list[1] = 34;
2658
1.02k
                    cand_mode_list[2] = 3;
2659
1.02k
                }
2660
8.26k
                else if(cand_mode_list[0] == 34)
2661
10
                {
2662
10
                    cand_mode_list[1] = 2;
2663
10
                    cand_mode_list[2] = 33;
2664
10
                }
2665
8.25k
                else
2666
8.25k
                {
2667
8.25k
                    cand_mode_list[1] = cand_mode_list[0] - 1;
2668
8.25k
                    cand_mode_list[2] = cand_mode_list[0] + 1;
2669
8.25k
                }
2670
                //cand_mode_list[1] = ps_ed_blk_l1->nang_attr.best_mode;
2671
                //cand_mode_list[2] = ps_ed_blk_l1->ang_attr.best_mode;
2672
9.28k
            }
2673
53.6k
            else
2674
53.6k
            {
2675
53.6k
                cand_mode_list[0] = 0;
2676
53.6k
                cand_mode_list[1] = 1;
2677
53.6k
                cand_mode_list[2] = 26;
2678
                //cand_mode_list[2] = ps_ed_blk_l1->nang_attr.best_mode;
2679
53.6k
            }
2680
2681
            /* All 32x32 costs are accumulated to 64x64 cost */
2682
62.9k
            ps_l0_ipe_out_ctb->i4_best64x64_intra_cost = 0;
2683
314k
            for(i = 0; i < 4; i++)
2684
251k
            {
2685
251k
                ps_l0_ipe_out_ctb->i4_best64x64_intra_cost +=
2686
251k
                    ps_l0_ipe_out_ctb->ai4_best32x32_intra_cost[i];
2687
251k
            }
2688
            /* by default 64x64 modes are set to default values DC and Planar */
2689
62.9k
            ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[0] = cand_mode_list[0];
2690
62.9k
            ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[1] = cand_mode_list[1];
2691
62.9k
            ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[2] = cand_mode_list[2];
2692
62.9k
            ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[3] = 255;
2693
2694
            /* Update CTB mode map for the finalised CU */
2695
62.9k
            x = ((ps_cu_node->u2_x0 << 3) >> 2) + 1;
2696
62.9k
            y = ((ps_cu_node->u2_y0 << 3) >> 2) + 1;
2697
62.9k
            size = ps_cu_node->u1_cu_size >> 2;
2698
2699
585k
            for(row = y; row < (y + size); row++)
2700
523k
            {
2701
5.01M
                for(col = x; col < (x + size); col++)
2702
4.49M
                {
2703
4.49M
                    ps_ctxt->au1_ctb_mode_map[row][col] = best_mode;
2704
4.49M
                }
2705
523k
            }
2706
2707
62.9k
            ihevce_set_nbr_map(
2708
62.9k
                ps_ctxt->pu1_ctb_nbr_map,
2709
62.9k
                ps_ctxt->i4_nbr_map_strd,
2710
62.9k
                (ps_cu_node->u2_x0 << 1),
2711
62.9k
                (ps_cu_node->u2_y0 << 1),
2712
62.9k
                (ps_cu_node->u1_cu_size >> 2),
2713
62.9k
                1);
2714
2715
            /*As 64*64 has won, pick L1 32x32 qp*/
2716
            //ASSERT(((blk_cnt>>6) & 0xF) == (blk_cnt>>6));
2717
            //ASSERT((blk_cnt>>6) == 0);
2718
62.9k
            ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][0] != -2);
2719
62.9k
            i1_cu_possible_qp = ihevce_cu_level_qp_mod(
2720
62.9k
                ps_ctxt->i4_qscale,
2721
62.9k
                ps_ed_ctb_l1->i4_32x32_satd[0][0],
2722
62.9k
                ps_ctxt->ld_curr_frame_32x32_log_avg[0],
2723
62.9k
                f_strength,
2724
62.9k
                &i4_act_factor,
2725
62.9k
                &i4_q_scale_q3_mod,
2726
62.9k
                ps_ctxt->ps_rc_quant_ctxt);
2727
2728
62.9k
            i8_frame_acc_satd_by_modqp_q10 =
2729
62.9k
                (i8_frame_acc_satd_cost << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
2730
62.9k
                i4_q_scale_q3_mod;
2731
            /* Increment pointers */
2732
62.9k
            ps_ed_blk_l1 += 64;
2733
62.9k
            ps_ed_blk_l2 += 16;
2734
            //ps_row_cu++;
2735
62.9k
        }
2736
203k
    }
2737
2738
    //ps_ctb_out->u1_num_cus_in_ctb = (UWORD8)(ps_row_cu - ps_curr_cu);
2739
2740
203k
    {
2741
203k
        WORD32 i4_i, i4_j;
2742
203k
        WORD32 dummy;
2743
203k
        WORD8 i1_cu_qp;
2744
203k
        (void)i1_cu_qp;
2745
        /*MAM_VAR_L1*/
2746
609k
        for(i4_j = 0; i4_j < 2; i4_j++)
2747
406k
        {
2748
406k
            i4_mod_factor_num = ps_ctxt->ai4_mod_factor_derived_by_variance[i4_j];
2749
406k
            f_strength = ps_ctxt->f_strength;
2750
2751
            //i4_mod_factor_num = 4;
2752
2753
406k
            ps_ed_blk_l1 = ps_ed_l1_ctb;
2754
406k
            ps_ed_blk_l2 = ps_ed_l2_ctb;
2755
            //ps_row_cu = ps_curr_cu;
2756
2757
            /*Valid only for complete CTB */
2758
406k
            if((64 == u1_curr_ctb_wdt) && (64 == u1_curr_ctb_hgt))
2759
361k
            {
2760
361k
                ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][0] != -2);
2761
361k
                ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][1] != -2);
2762
361k
                ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][2] != -2);
2763
361k
                ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][3] != -2);
2764
2765
361k
                i1_cu_qp = ihevce_cu_level_qp_mod(
2766
361k
                    ps_ctxt->i4_qscale,
2767
361k
                    ps_ed_ctb_l1->i4_32x32_satd[0][0],
2768
361k
                    ps_ctxt->ld_curr_frame_32x32_log_avg[0],
2769
361k
                    f_strength,
2770
361k
                    &ps_l0_ipe_out_ctb->i4_64x64_act_factor[0][i4_j],
2771
361k
                    &dummy,
2772
361k
                    ps_ctxt->ps_rc_quant_ctxt);
2773
2774
361k
                i1_cu_qp = ihevce_cu_level_qp_mod(
2775
361k
                    ps_ctxt->i4_qscale,
2776
361k
                    ps_ed_ctb_l1->i4_32x32_satd[0][1],
2777
361k
                    ps_ctxt->ld_curr_frame_32x32_log_avg[1],
2778
361k
                    f_strength,
2779
361k
                    &ps_l0_ipe_out_ctb->i4_64x64_act_factor[1][i4_j],
2780
361k
                    &dummy,
2781
361k
                    ps_ctxt->ps_rc_quant_ctxt);
2782
361k
                i1_cu_qp = ihevce_cu_level_qp_mod(
2783
361k
                    ps_ctxt->i4_qscale,
2784
361k
                    ps_ed_ctb_l1->i4_32x32_satd[0][2],
2785
361k
                    ps_ctxt->ld_curr_frame_32x32_log_avg[2],
2786
361k
                    f_strength,
2787
361k
                    &ps_l0_ipe_out_ctb->i4_64x64_act_factor[2][i4_j],
2788
361k
                    &dummy,
2789
361k
                    ps_ctxt->ps_rc_quant_ctxt);
2790
2791
361k
                i1_cu_qp = ihevce_cu_level_qp_mod(
2792
361k
                    ps_ctxt->i4_qscale,
2793
361k
                    ps_ed_ctb_l1->i4_32x32_satd[0][3],
2794
361k
                    2.0 + ps_ctxt->ld_curr_frame_16x16_log_avg[0],
2795
361k
                    f_strength,
2796
361k
                    &ps_l0_ipe_out_ctb->i4_64x64_act_factor[3][i4_j],
2797
361k
                    &dummy,
2798
361k
                    ps_ctxt->ps_rc_quant_ctxt);
2799
2800
361k
                ASSERT(ps_l0_ipe_out_ctb->i4_64x64_act_factor[3][i4_j] > 0);
2801
361k
            }
2802
44.5k
            else
2803
44.5k
            {
2804
44.5k
                ps_l0_ipe_out_ctb->i4_64x64_act_factor[0][i4_j] = 1024;
2805
44.5k
                ps_l0_ipe_out_ctb->i4_64x64_act_factor[1][i4_j] = 1024;
2806
44.5k
                ps_l0_ipe_out_ctb->i4_64x64_act_factor[2][i4_j] = 1024;
2807
44.5k
                ps_l0_ipe_out_ctb->i4_64x64_act_factor[3][i4_j] = 1024;
2808
44.5k
            }
2809
2810
            /*Store the 8x8 Qps from L2 (in raster order) as output of intra prediction
2811
            for the usage by ME*/
2812
2813
406k
            {
2814
406k
                WORD32 pos_x_32, pos_y_32, pos;
2815
                //WORD32 i4_incomplete_ctb_val_8;
2816
406k
                pos_x_32 = u1_curr_ctb_wdt / 16;
2817
406k
                pos_y_32 = u1_curr_ctb_hgt / 16;
2818
2819
406k
                pos = (pos_x_32 < pos_y_32) ? pos_x_32 : pos_y_32;
2820
2821
2.03M
                for(i4_i = 0; i4_i < 4; i4_i++)
2822
1.62M
                {
2823
1.62M
                    if(i4_i < pos)
2824
1.49M
                    {
2825
1.49M
                        ASSERT(ps_ed_ctb_l1->i4_16x16_satd[i4_i][0] != -2);
2826
1.49M
                        ASSERT(ps_ed_ctb_l1->i4_16x16_satd[i4_i][1] != -2);
2827
1.49M
                        ASSERT(ps_ed_ctb_l1->i4_16x16_satd[i4_i][2] != -2);
2828
1.49M
                        i1_cu_qp = ihevce_cu_level_qp_mod(
2829
1.49M
                            ps_ctxt->i4_qscale,
2830
1.49M
                            ps_ed_ctb_l1->i4_16x16_satd[i4_i][0],
2831
1.49M
                            ps_ctxt->ld_curr_frame_16x16_log_avg[0],
2832
1.49M
                            f_strength,
2833
1.49M
                            &ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][0][i4_j],
2834
1.49M
                            &dummy,
2835
1.49M
                            ps_ctxt->ps_rc_quant_ctxt);
2836
1.49M
                        i1_cu_qp = ihevce_cu_level_qp_mod(
2837
1.49M
                            ps_ctxt->i4_qscale,
2838
1.49M
                            ps_ed_ctb_l1->i4_16x16_satd[i4_i][1],
2839
1.49M
                            ps_ctxt->ld_curr_frame_16x16_log_avg[1],
2840
1.49M
                            f_strength,
2841
1.49M
                            &ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][1][i4_j],
2842
1.49M
                            &dummy,
2843
1.49M
                            ps_ctxt->ps_rc_quant_ctxt);
2844
1.49M
                        i1_cu_qp = ihevce_cu_level_qp_mod(
2845
1.49M
                            ps_ctxt->i4_qscale,
2846
1.49M
                            ps_ed_ctb_l1->i4_16x16_satd[i4_i][2],
2847
1.49M
                            ps_ctxt->ld_curr_frame_16x16_log_avg[2],
2848
1.49M
                            f_strength,
2849
1.49M
                            &ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][2][i4_j],
2850
1.49M
                            &dummy,
2851
1.49M
                            ps_ctxt->ps_rc_quant_ctxt);
2852
1.49M
                    }
2853
126k
                    else
2854
126k
                    {
2855
                        /*For incomplete CTB */
2856
126k
                        ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][0][i4_j] = 1024;
2857
126k
                        ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][1][i4_j] = 1024;
2858
126k
                        ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][2][i4_j] = 1024;
2859
126k
                    }
2860
1.62M
                }
2861
406k
            }
2862
2863
            /*Store the 8x8 Qps from L1 (in raster order) as output of intra prediction
2864
            for the usage by ME*/
2865
406k
            {
2866
406k
                WORD32 pos_x_16, pos_y_16, pos;
2867
                //WORD32 i4_incomplete_ctb_val_8;
2868
406k
                pos_x_16 = u1_curr_ctb_wdt / 4;
2869
406k
                pos_y_16 = u1_curr_ctb_hgt / 4;
2870
2871
406k
                pos = (pos_x_16 < pos_y_16) ? pos_x_16 : pos_y_16;
2872
6.91M
                for(i4_i = 0; i4_i < 16; i4_i++)
2873
6.50M
                {
2874
6.50M
                    if(i4_i < pos)
2875
6.06M
                    {
2876
6.06M
                        ASSERT(ps_ed_ctb_l1->i4_8x8_satd[i4_i][0] != -2);
2877
6.06M
                        ASSERT(ps_ed_ctb_l1->i4_8x8_satd[i4_i][1] != -2);
2878
6.06M
                        i1_cu_qp = ihevce_cu_level_qp_mod(
2879
6.06M
                            ps_ctxt->i4_qscale,
2880
6.06M
                            ps_ed_ctb_l1->i4_8x8_satd[i4_i][0],
2881
6.06M
                            ps_ctxt->ld_curr_frame_8x8_log_avg[0],
2882
6.06M
                            f_strength,
2883
6.06M
                            &ps_l0_ipe_out_ctb->i4_16x16_act_factor[i4_i][0][i4_j],
2884
6.06M
                            &dummy,
2885
6.06M
                            ps_ctxt->ps_rc_quant_ctxt);
2886
6.06M
                        i1_cu_qp = ihevce_cu_level_qp_mod(
2887
6.06M
                            ps_ctxt->i4_qscale,
2888
6.06M
                            ps_ed_ctb_l1->i4_8x8_satd[i4_i][1],
2889
6.06M
                            ps_ctxt->ld_curr_frame_8x8_log_avg[1],
2890
6.06M
                            f_strength,
2891
6.06M
                            &ps_l0_ipe_out_ctb->i4_16x16_act_factor[i4_i][1][i4_j],
2892
6.06M
                            &dummy,
2893
6.06M
                            ps_ctxt->ps_rc_quant_ctxt);
2894
6.06M
                    }
2895
437k
                    else
2896
437k
                    {
2897
                        /*For incomplete CTB */
2898
437k
                        ps_l0_ipe_out_ctb->i4_16x16_act_factor[i4_i][0][i4_j] = 1024;
2899
437k
                        ps_l0_ipe_out_ctb->i4_16x16_act_factor[i4_i][1][i4_j] = 1024;
2900
437k
                    }
2901
6.50M
                }
2902
406k
            }
2903
406k
        }  //for loop
2904
2905
        /* Accumalate the cost of ctb to the total cost */
2906
203k
        ps_ctxt->i8_frame_acc_satd_cost += i8_frame_acc_satd_cost;
2907
203k
        ps_ctxt->i8_frame_acc_satd_by_modqp_q10 += i8_frame_acc_satd_by_modqp_q10;
2908
2909
203k
        ps_ctxt->i8_frame_acc_mode_bits_cost += i8_frame_acc_mode_bits_cost;
2910
2911
        /* satd and mpm bits accumalation of best cu size candiate for the ctb */
2912
203k
        ps_l0_ipe_out_ctb->i4_ctb_acc_satd = i4_ctb_acc_satd;
2913
203k
        ps_l0_ipe_out_ctb->i4_ctb_acc_mpm_bits = i8_frame_acc_mode_bits_cost;
2914
2915
203k
        ps_ctxt->i8_frame_acc_satd += i4_ctb_acc_satd;
2916
203k
    }
2917
2918
203k
    {
2919
203k
        WORD32 ctr_8x8;
2920
3.45M
        for(ctr_8x8 = 0; ctr_8x8 < (MAX_CU_IN_CTB >> 2); ctr_8x8++)
2921
3.25M
        {
2922
            /*Accumalate activity factor for Intra and Inter*/
2923
3.25M
            if(ps_l0_ipe_out_ctb->ai4_best_sad_cost_8x8_l1_ipe[ctr_8x8] <
2924
3.25M
               ps_ed_ctb_l1->i4_sad_me_for_ref[ctr_8x8])
2925
64.3k
            {
2926
64.3k
                ps_l0_ipe_out_ctb->ai4_8x8_act_factor[ctr_8x8] =
2927
64.3k
                    ps_l0_ipe_out_ctb->i4_16x16_act_factor[ctr_8x8][1][0];
2928
64.3k
            }
2929
3.18M
            else
2930
3.18M
            {
2931
3.18M
                ps_l0_ipe_out_ctb->ai4_8x8_act_factor[ctr_8x8] =
2932
3.18M
                    ps_l0_ipe_out_ctb->i4_16x16_act_factor[ctr_8x8][1][0];
2933
3.18M
            }
2934
2935
            /*Accumalate activity factor at frame level*/
2936
3.25M
            ps_ctxt->i8_frame_acc_act_factor += ps_l0_ipe_out_ctb->ai4_8x8_act_factor[ctr_8x8];
2937
3.25M
        }
2938
203k
    }
2939
203k
    return;
2940
203k
}
2941
2942
/*!
******************************************************************************
* \if Function name : ihevce_nxn_sad_computer \endif
*
* \brief
*    Sum of absolute differences between two square blocks of 8-bit samples.
*
* \param[in] pu1_inp       : top-left sample of the first block
* \param[in] i4_inp_stride : distance, in samples, between rows of pu1_inp
* \param[in] pu1_ref       : top-left sample of the second block
* \param[in] i4_ref_stride : distance, in samples, between rows of pu1_ref
* \param[in] trans_size    : width and height of the blocks, in samples
*
* \return
*    Accumulated |inp - ref| over trans_size x trans_size samples
*    (0 when trans_size is not positive)
*
*****************************************************************************
*/
WORD32 ihevce_nxn_sad_computer(
    UWORD8 *pu1_inp, WORD32 i4_inp_stride, UWORD8 *pu1_ref, WORD32 i4_ref_stride, WORD32 trans_size)
{
    WORD32 i4_acc = 0;
    WORD32 rows_left = trans_size;

    while(rows_left > 0)
    {
        WORD32 col = 0;

        while(col < trans_size)
        {
            /* Widen before subtracting so the difference can go negative */
            WORD32 i4_diff = (WORD32)pu1_inp[col] - (WORD32)pu1_ref[col];

            i4_acc += ABS(i4_diff);
            col++;
        }

        /* Step both blocks down one row */
        pu1_inp += i4_inp_stride;
        pu1_ref += i4_ref_stride;
        rows_left--;
    }

    return i4_acc;
}
2963
2964
/*!
2965
******************************************************************************
2966
* \if Function name : ihevce_mode_eval_filtering \endif
2967
*
2968
* \brief
2969
*    Evaluates best 3 modes for the given CU size with probable modes from,
2970
*    early decision structure, mpm candidates and dc, planar mode
2971
*
2972
* \param[in] ps_cu_node : pointer to MAX cu node info buffer
2973
* \param[in] ps_child_cu_node : pointer to (MAX - 1) cu node info buffer
2974
* \param[in] ps_ctxt : pointer to IPE context struct
2975
* \param[in] ps_curr_src : pointer to src pixels struct
2976
* \param[in] best_amode : best angular mode from l1 layer or
2977
                            from (MAX - 1) CU mode
2978
* \param[in] best_costs_4x4  : pointer to 3 best cost buffer
2979
* \param[in] best_modes_4x4  : pointer to 3 best mode buffer
2980
* \param[in] step2_bypass : if 0, (MAX - 1) CU is evaluated
2981
*                           if 1, (MAX CU) sugested is evaluated
2982
* \param[in] tu_eq_cu     : indicates if tu size is same as cu or cu/2
2983
*
2984
* \return
2985
*    None
2986
*
2987
* \author
2988
*  Ittiam
2989
*
2990
*****************************************************************************
2991
*/
2992
void ihevce_mode_eval_filtering(
2993
    ihevce_ipe_cu_tree_t *ps_cu_node,
2994
    ihevce_ipe_cu_tree_t *ps_child_cu_node,
2995
    ihevce_ipe_ctxt_t *ps_ctxt,
2996
    iv_enc_yuv_buf_t *ps_curr_src,
2997
    WORD32 best_amode,
2998
    WORD32 *best_costs_4x4,
2999
    UWORD8 *best_modes_4x4,
3000
    WORD32 step2_bypass,
3001
    WORD32 tu_eq_cu)
3002
7.05M
{
3003
7.05M
    UWORD8 *pu1_origin, *pu1_orig;
3004
7.05M
    WORD32 src_strd = ps_curr_src->i4_y_strd;
3005
7.05M
    WORD32 nbr_flags;
3006
7.05M
    nbr_avail_flags_t s_nbr;
3007
7.05M
    WORD32 trans_size = tu_eq_cu ? ps_cu_node->u1_cu_size : ps_cu_node->u1_cu_size >> 1;
3008
7.05M
    WORD32 num_tu_in_x = tu_eq_cu ? 1 : 2;
3009
7.05M
    WORD32 num_tu_in_y = tu_eq_cu ? 1 : 2;
3010
7.05M
    UWORD8 mode;
3011
3012
7.05M
    WORD32 cost_ang_mode = MAX_INTRA_COST_IPE;
3013
7.05M
    WORD32 filter_flag;
3014
7.05M
    WORD32 cost_amode_step2[7] = { 0 };
3015
    /*WORD32 best_sad[5];  // NOTE_A01: Not getting consumed at present */
3016
7.05M
    WORD32 sad = 0;
3017
7.05M
    WORD32 cu_pos_x, cu_pos_y;
3018
7.05M
    WORD32 temp;
3019
7.05M
    WORD32 i = 0, j, k, i_end, z;
3020
    //WORD32 row, col, size;
3021
7.05M
    UWORD8 *pu1_ref;
3022
7.05M
    WORD32 xA, yA, xB, yB;
3023
7.05M
    WORD32 top_intra_mode;
3024
7.05M
    WORD32 left_intra_mode;
3025
7.05M
    UWORD8 *pu1_ref_orig = &ps_ctxt->au1_ref_samples[0];
3026
7.05M
    UWORD8 *pu1_ref_filt = &ps_ctxt->au1_filt_ref_samples[0];
3027
3028
7.05M
    UWORD8 modes_4x4[5] = { 0, 1, 2, 3, 4 };
3029
7.05M
    WORD32 count;
3030
3031
7.05M
    pf_ipe_res_trans_had apf_resd_trns_had[4];
3032
3033
7.05M
    WORD32 cand_mode_satd_list[3];
3034
7.05M
    ihevc_intra_pred_luma_ref_substitution_ft *ihevc_intra_pred_luma_ref_substitution_fptr;
3035
3036
7.05M
    ihevc_intra_pred_luma_ref_substitution_fptr =
3037
7.05M
        ps_ctxt->ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;
3038
3039
7.05M
    apf_resd_trns_had[0] = ps_ctxt->s_cmn_opt_func.pf_HAD_4x4_8bit;
3040
7.05M
    apf_resd_trns_had[1] = ps_ctxt->s_cmn_opt_func.pf_HAD_8x8_8bit;
3041
7.05M
    apf_resd_trns_had[2] = ps_ctxt->s_cmn_opt_func.pf_HAD_16x16_8bit;
3042
7.05M
    apf_resd_trns_had[3] = ps_ctxt->s_cmn_opt_func.pf_HAD_32x32_8bit;
3043
3044
    /* initialize modes_to_eval as zero */
3045
7.05M
    memset(&ps_ctxt->au1_modes_to_eval, 0, MAX_NUM_IP_MODES);
3046
3047
    /* Compute the Parent Cost */
3048
3049
    /* Pointer to top-left of the CU - y0,x0 in 8x8 granularity */
3050
7.05M
    pu1_orig = (UWORD8 *)(ps_curr_src->pv_y_buf) + ((ps_cu_node->u2_y0 << 3) * src_strd) +
3051
7.05M
               (ps_cu_node->u2_x0 << 3);
3052
3053
    /* Get position of CU within CTB at 4x4 granularity */
3054
7.05M
    cu_pos_x = ps_cu_node->u2_x0 << 1;
3055
7.05M
    cu_pos_y = ps_cu_node->u2_y0 << 1;
3056
3057
    /* get the neighbour availability flags */
3058
7.05M
    ihevce_get_only_nbr_flag(
3059
7.05M
        &s_nbr,
3060
7.05M
        ps_ctxt->pu1_ctb_nbr_map,
3061
7.05M
        ps_ctxt->i4_nbr_map_strd,
3062
7.05M
        cu_pos_x,
3063
7.05M
        cu_pos_y,
3064
7.05M
        trans_size >> 2,
3065
7.05M
        trans_size >> 2);
3066
3067
    /* Traverse for all 4 child blocks in the parent block */
3068
7.05M
    xA = (ps_cu_node->u2_x0 << 3) >> 2;
3069
7.05M
    yA = ((ps_cu_node->u2_y0 << 3) >> 2) + 1;
3070
7.05M
    xB = xA + 1;
3071
7.05M
    yB = yA - 1;
3072
7.05M
    left_intra_mode = ps_ctxt->au1_ctb_mode_map[yA][xA];
3073
7.05M
    top_intra_mode = ps_ctxt->au1_ctb_mode_map[yB][xB];
3074
    /* call the function which populates sad cost for all the modes */
3075
3076
7.05M
    ihevce_intra_populate_mode_bits_cost_bracketing(
3077
7.05M
        top_intra_mode,
3078
7.05M
        left_intra_mode,
3079
7.05M
        s_nbr.u1_top_avail,
3080
7.05M
        s_nbr.u1_left_avail,
3081
7.05M
        ps_cu_node->u2_y0,
3082
7.05M
        &ps_ctxt->au2_mode_bits_satd_cost[0],
3083
7.05M
        &ps_ctxt->au2_mode_bits_satd[0],
3084
7.05M
        ps_ctxt->i4_ol_satd_lambda,
3085
7.05M
        cand_mode_satd_list);
3086
3087
16.5M
    for(k = 0; k < num_tu_in_y; k++)
3088
9.51M
    {
3089
23.9M
        for(j = 0; j < num_tu_in_x; j++)
3090
14.4M
        {
3091
            /* get the neighbour availability flags */
3092
14.4M
            nbr_flags = ihevce_get_nbr_intra(
3093
14.4M
                &s_nbr,
3094
14.4M
                ps_ctxt->pu1_ctb_nbr_map,
3095
14.4M
                ps_ctxt->i4_nbr_map_strd,
3096
14.4M
                cu_pos_x + ((j) * (trans_size >> 2)),
3097
14.4M
                cu_pos_y + ((k) * (trans_size >> 2)),
3098
14.4M
                trans_size >> 2);
3099
3100
14.4M
            pu1_origin = pu1_orig + (k * trans_size * src_strd) + (j * trans_size);
3101
3102
            /* Create reference samples array */
3103
14.4M
            ihevc_intra_pred_luma_ref_substitution_fptr(
3104
14.4M
                pu1_origin - src_strd - 1,
3105
14.4M
                pu1_origin - src_strd,
3106
14.4M
                pu1_origin - 1,
3107
14.4M
                src_strd,
3108
14.4M
                trans_size,
3109
14.4M
                nbr_flags,
3110
14.4M
                pu1_ref_orig,
3111
14.4M
                0);
3112
3113
            /* Perform reference samples filtering */
3114
14.4M
            ihevce_intra_pred_ref_filtering(pu1_ref_orig, trans_size, pu1_ref_filt);
3115
3116
14.4M
            ihevce_set_nbr_map(
3117
14.4M
                ps_ctxt->pu1_ctb_nbr_map,
3118
14.4M
                ps_ctxt->i4_nbr_map_strd,
3119
14.4M
                cu_pos_x + ((j) * (trans_size >> 2)),
3120
14.4M
                cu_pos_y + ((k) * (trans_size >> 2)),
3121
14.4M
                (trans_size >> 2),
3122
14.4M
                1);
3123
3124
14.4M
            pu1_ref_orig += (4 * MAX_CTB_SIZE + 1);
3125
14.4M
            pu1_ref_filt += (4 * MAX_CTB_SIZE + 1);
3126
14.4M
        }
3127
9.51M
    }
3128
3129
    /* Revaluation for angular mode */
3130
    //if(ps_ed_blk->ang_attr.mode_present == 1)
3131
    //if(((best_amode & 0x1) != 1))
3132
3133
7.05M
    {
3134
7.05M
        WORD32 u1_trans_idx = trans_size >> 3;
3135
7.05M
        if(trans_size == 32)
3136
577k
            u1_trans_idx = 3;
3137
        //best_amode = ps_ed_blk->ang_attr.best_mode;
3138
3139
7.05M
        i = 0;
3140
7.05M
        if(!step2_bypass)
3141
4.95M
        {
3142
            /* Around best level 4 angular mode, search for best level 2 mode */
3143
4.95M
            ASSERT((best_amode >= 2) && (best_amode <= 34));
3144
3145
4.95M
            if(ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P3)
3146
3.93M
            {
3147
3.93M
                if(best_amode >= 4)
3148
3.91M
                    ps_ctxt->au1_modes_to_eval_temp[i++] = best_amode - 2;
3149
3.93M
            }
3150
3151
4.95M
            ps_ctxt->au1_modes_to_eval_temp[i++] = best_amode;
3152
3153
4.95M
            if(ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P3)
3154
3.93M
            {
3155
3.93M
                if(best_amode <= 32)
3156
3.92M
                    ps_ctxt->au1_modes_to_eval_temp[i++] = best_amode + 2;
3157
3.93M
            }
3158
4.95M
        }
3159
2.09M
        else
3160
2.09M
        {
3161
2.09M
            ps_ctxt->au1_modes_to_eval_temp[i++] = ps_child_cu_node->ps_sub_cu[0]->best_mode;
3162
2.09M
            ps_ctxt->au1_modes_to_eval_temp[i++] = ps_child_cu_node->ps_sub_cu[1]->best_mode;
3163
2.09M
            ps_ctxt->au1_modes_to_eval_temp[i++] = ps_child_cu_node->ps_sub_cu[2]->best_mode;
3164
2.09M
            ps_ctxt->au1_modes_to_eval_temp[i++] = ps_child_cu_node->ps_sub_cu[3]->best_mode;
3165
2.09M
        }
3166
3167
        /* Add the left and top MPM modes for computation*/
3168
3169
7.05M
        ps_ctxt->au1_modes_to_eval_temp[i++] = cand_mode_satd_list[0];
3170
7.05M
        ps_ctxt->au1_modes_to_eval_temp[i++] = cand_mode_satd_list[1];
3171
3172
7.05M
        i_end = i;
3173
7.05M
        count = 0;
3174
3175
        /*Remove duplicate modes from modes_to_eval_temp[] */
3176
42.3M
        for(j = 0; j < i_end; j++)
3177
35.2M
        {
3178
77.3M
            for(k = 0; k < count; k++)
3179
44.8M
            {
3180
44.8M
                if(ps_ctxt->au1_modes_to_eval_temp[j] == ps_ctxt->au1_modes_to_eval[k])
3181
2.75M
                    break;
3182
44.8M
            }
3183
35.2M
            if((k == count) && (ps_ctxt->au1_modes_to_eval_temp[j] > 1))
3184
16.6M
            {
3185
16.6M
                ps_ctxt->au1_modes_to_eval[count] = ps_ctxt->au1_modes_to_eval_temp[j];
3186
16.6M
                count++;
3187
16.6M
            }
3188
35.2M
        }
3189
7.05M
        i_end = count;
3190
7.05M
        if(count == 0)
3191
994k
        {
3192
994k
            ps_ctxt->au1_modes_to_eval[0] = 26;
3193
994k
            i_end = 1;
3194
994k
        }
3195
3196
24.6M
        for(i = 0; i < i_end; i++)
3197
17.6M
        {
3198
17.6M
            pu1_ref_orig = &ps_ctxt->au1_ref_samples[0];
3199
17.6M
            pu1_ref_filt = &ps_ctxt->au1_filt_ref_samples[0];
3200
3201
17.6M
            mode = ps_ctxt->au1_modes_to_eval[i];
3202
17.6M
            ASSERT((mode >= 2) && (mode <= 34));
3203
17.6M
            cost_amode_step2[i] = ps_ctxt->au2_mode_bits_satd_cost[mode];
3204
17.6M
            filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(trans_size) - 2));
3205
3206
42.0M
            for(k = 0; k < num_tu_in_y; k++)
3207
24.4M
            {
3208
62.7M
                for(j = 0; j < num_tu_in_x; j++)
3209
38.2M
                {
3210
38.2M
                    pu1_origin = pu1_orig + (k * trans_size * src_strd) + (j * trans_size);
3211
3212
38.2M
                    if(0 == filter_flag)
3213
33.9M
                        pu1_ref = pu1_ref_orig;
3214
4.29M
                    else
3215
4.29M
                        pu1_ref = pu1_ref_filt;
3216
3217
38.2M
                    g_apf_lum_ip[g_i4_ip_funcs[mode]](
3218
38.2M
                        pu1_ref, 0, &ps_ctxt->au1_pred_samples[0], trans_size, trans_size, mode);
3219
3220
38.2M
                    if(ps_ctxt->u1_use_satd)
3221
36.3M
                    {
3222
36.3M
                        sad = apf_resd_trns_had[u1_trans_idx](
3223
36.3M
                            pu1_origin,
3224
36.3M
                            ps_curr_src->i4_y_strd,
3225
36.3M
                            &ps_ctxt->au1_pred_samples[0],
3226
36.3M
                            trans_size,
3227
36.3M
                            NULL,
3228
36.3M
                            0
3229
3230
36.3M
                        );
3231
36.3M
                    }
3232
1.90M
                    else
3233
1.90M
                    {
3234
1.90M
                        sad = ps_ctxt->s_ipe_optimised_function_list.pf_nxn_sad_computer(
3235
1.90M
                            pu1_origin,
3236
1.90M
                            ps_curr_src->i4_y_strd,
3237
1.90M
                            &ps_ctxt->au1_pred_samples[0],
3238
1.90M
                            trans_size,
3239
1.90M
                            trans_size);
3240
1.90M
                    }
3241
3242
38.2M
                    cost_amode_step2[i] += sad;
3243
3244
38.2M
                    pu1_ref_orig += (4 * MAX_CTB_SIZE + 1);
3245
38.2M
                    pu1_ref_filt += (4 * MAX_CTB_SIZE + 1);
3246
38.2M
                }
3247
24.4M
            }
3248
17.6M
        }
3249
7.05M
        best_amode = ps_ctxt->au1_modes_to_eval[0];
3250
        /*Init cost indx */
3251
7.05M
        cost_ang_mode = MAX_INTRA_COST_IPE;  //cost_amode_step2[0];
3252
24.6M
        for(z = 0; z < i_end; z++)
3253
17.6M
        {
3254
            /* Least cost of all 3 angles are stored in cost_amode_step2[0] and corr. mode*/
3255
17.6M
            if(cost_ang_mode >= cost_amode_step2[z])
3256
12.6M
            {
3257
12.6M
                if(cost_ang_mode == cost_amode_step2[z])
3258
1.21M
                {
3259
1.21M
                    if(best_amode > ps_ctxt->au1_modes_to_eval[z])
3260
15.9k
                        best_amode = ps_ctxt->au1_modes_to_eval[z];
3261
1.21M
                }
3262
11.4M
                else
3263
11.4M
                {
3264
11.4M
                    best_amode = ps_ctxt->au1_modes_to_eval[z];
3265
11.4M
                }
3266
12.6M
                cost_ang_mode = cost_amode_step2[z];
3267
12.6M
            }
3268
17.6M
        }
3269
3270
        /*Modify mode bits for the angular modes */
3271
7.05M
    }
3272
3273
7.05M
    {
3274
        /* Step - I modification */
3275
7.05M
        ASSERT((best_amode >= 2) && (best_amode <= 34));
3276
7.05M
        i_end = 0;
3277
7.05M
        z = 0;
3278
3279
        /* Around best level 3 angular mode, search for best level 1 mode */
3280
7.05M
        ps_ctxt->au1_modes_to_eval[i_end++] = 0;
3281
7.05M
        ps_ctxt->au1_modes_to_eval[i_end++] = 1;
3282
3283
7.05M
        if(best_amode != 2)
3284
6.86M
            ps_ctxt->au1_modes_to_eval[i_end++] = best_amode - 1;
3285
3286
7.05M
        ps_ctxt->au1_modes_to_eval[i_end++] = best_amode;
3287
3288
7.05M
        if(best_amode != 34)
3289
6.99M
            ps_ctxt->au1_modes_to_eval[i_end++] = best_amode + 1;
3290
3291
        /* Inserting step_2's best mode at last to avoid
3292
        recalculation of it's SATD cost */
3293
3294
        //ps_ctxt->au1_modes_to_eval[i_end] = best_amode; //Bugfix: HSAD compared with SAD
3295
        //cost_amode_step2[i_end] = cost_ang_mode;
3296
3297
        /*best_sad[i_end] = cost_ang_mode
3298
                - mode_bits_satd_cost[best_amode]; //See NOTE_A01 above */
3299
3300
7.05M
        cost_ang_mode = MAX_INTRA_COST_IPE; /* Init cost */
3301
3302
42.0M
        for(i = 0; i < i_end; i++)
3303
35.0M
        {
3304
35.0M
            WORD32 u1_trans_idx = trans_size >> 3;
3305
35.0M
            if(trans_size == 32)
3306
2.87M
                u1_trans_idx = 3;
3307
35.0M
            pu1_ref_orig = &ps_ctxt->au1_ref_samples[0];
3308
35.0M
            pu1_ref_filt = &ps_ctxt->au1_filt_ref_samples[0];
3309
3310
            /*best_sad[i] = 0; //See NOTE_A01 above */
3311
35.0M
            mode = ps_ctxt->au1_modes_to_eval[i];
3312
35.0M
            cost_amode_step2[i] = ps_ctxt->au2_mode_bits_satd_cost[mode];
3313
35.0M
            filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(trans_size) - 2));
3314
3315
82.2M
            for(k = 0; k < num_tu_in_y; k++)
3316
47.2M
            {
3317
118M
                for(j = 0; j < num_tu_in_x; j++)
3318
71.6M
                {
3319
71.6M
                    pu1_origin = pu1_orig + (k * trans_size * src_strd) + (j * trans_size);
3320
3321
71.6M
                    if(0 == filter_flag)
3322
57.6M
                        pu1_ref = pu1_ref_orig;
3323
13.9M
                    else
3324
13.9M
                        pu1_ref = pu1_ref_filt;
3325
3326
71.6M
                    g_apf_lum_ip[g_i4_ip_funcs[mode]](
3327
71.6M
                        pu1_ref, 0, &ps_ctxt->au1_pred_samples[0], trans_size, trans_size, mode);
3328
3329
                    //if(trans_size != 4)
3330
71.6M
                    {
3331
71.6M
                        sad = apf_resd_trns_had[u1_trans_idx](
3332
71.6M
                            pu1_origin,
3333
71.6M
                            ps_curr_src->i4_y_strd,
3334
71.6M
                            &ps_ctxt->au1_pred_samples[0],
3335
71.6M
                            trans_size,
3336
71.6M
                            NULL,
3337
71.6M
                            0);
3338
71.6M
                    }
3339
3340
                    /*accumualting SATD though name says it is sad*/
3341
71.6M
                    cost_amode_step2[i] += sad;
3342
                    /*best_sad[i] +=sad; //See NOTE_A01 above */
3343
71.6M
                    pu1_ref_orig += (4 * MAX_CTB_SIZE + 1);
3344
71.6M
                    pu1_ref_filt += (4 * MAX_CTB_SIZE + 1);
3345
71.6M
                }
3346
47.2M
            }
3347
35.0M
        }
3348
        /* Updating i_end for the step_2's inserted mode*/
3349
        //        i_end++;
3350
3351
        /* Arrange the reference array in ascending order */
3352
3353
35.0M
        for(i = 0; i < (i_end - 1); i++)
3354
27.9M
        {
3355
97.4M
            for(j = i + 1; j < i_end; j++)
3356
69.5M
            {
3357
69.5M
                if(cost_amode_step2[i] > cost_amode_step2[j])
3358
13.6M
                {
3359
13.6M
                    temp = cost_amode_step2[i];
3360
13.6M
                    cost_amode_step2[i] = cost_amode_step2[j];
3361
13.6M
                    cost_amode_step2[j] = temp;
3362
3363
13.6M
                    temp = modes_4x4[i];
3364
13.6M
                    modes_4x4[i] = modes_4x4[j];
3365
13.6M
                    modes_4x4[j] = temp;
3366
13.6M
                }
3367
69.5M
            }
3368
27.9M
        }
3369
3370
        /* Least cost of all 3 angles are stored in cost_amode_step2[0] and corr. mode*/
3371
7.05M
        best_amode = ps_ctxt->au1_modes_to_eval[modes_4x4[0]];
3372
7.05M
        cost_ang_mode = cost_amode_step2[0];
3373
7.05M
        ps_cu_node->best_satd = cost_ang_mode - ps_ctxt->au2_mode_bits_satd_cost[best_amode];
3374
7.05M
        ps_cu_node->best_cost = cost_amode_step2[0];
3375
7.05M
        ps_cu_node->best_mode = ps_ctxt->au1_modes_to_eval[modes_4x4[0]];
3376
7.05M
        ps_cu_node->best_satd =
3377
7.05M
            ps_cu_node->best_cost - ps_ctxt->au2_mode_bits_satd_cost[ps_cu_node->best_mode];
3378
3379
        /*Accumalate best mode bits cost for RC*/
3380
7.05M
        ps_cu_node->u2_mode_bits_cost = ps_ctxt->au2_mode_bits_satd[ps_cu_node->best_mode];
3381
3382
        /* Store the best three candidates */
3383
28.2M
        for(i = 0; i < 3; i++)
3384
21.1M
        {
3385
21.1M
            best_costs_4x4[i] = cost_amode_step2[i];
3386
21.1M
            best_modes_4x4[i] = ps_ctxt->au1_modes_to_eval[modes_4x4[i]];
3387
21.1M
        }
3388
7.05M
    }
3389
3390
7.05M
    return;
3391
7.05M
}