Coverage Report

Created: 2026-06-13 06:29

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libhevc/encoder/ihevce_recur_bracketing.c
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2018 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
21
/*!
22
******************************************************************************
23
* \file ihevce_recur_bracketing.c
24
*
25
* \brief
26
*    This file contains interface functions of recursive bracketing
27
*    module
28
* \date
29
*    12/02/2012
30
*
31
* \author
32
*    Ittiam
33
*
34
* List of Functions
35
*
36
*
37
******************************************************************************
38
*/
39
40
/*****************************************************************************/
41
/* File Includes                                                             */
42
/*****************************************************************************/
43
/* System include files */
44
#include <stdio.h>
45
#include <string.h>
46
#include <stdlib.h>
47
#include <assert.h>
48
#include <stdarg.h>
49
#include <math.h>
50
51
/* User include files */
52
#include "ihevc_typedefs.h"
53
#include "itt_video_api.h"
54
#include "ihevce_api.h"
55
56
#include "rc_cntrl_param.h"
57
#include "rc_frame_info_collector.h"
58
#include "rc_look_ahead_params.h"
59
60
#include "ihevc_defs.h"
61
#include "ihevc_structs.h"
62
#include "ihevc_platform_macros.h"
63
#include "ihevc_deblk.h"
64
#include "ihevc_itrans_recon.h"
65
#include "ihevc_chroma_itrans_recon.h"
66
#include "ihevc_chroma_intra_pred.h"
67
#include "ihevc_intra_pred.h"
68
#include "ihevc_inter_pred.h"
69
#include "ihevc_mem_fns.h"
70
#include "ihevc_padding.h"
71
#include "ihevc_weighted_pred.h"
72
#include "ihevc_sao.h"
73
#include "ihevc_resi_trans.h"
74
#include "ihevc_quant_iquant_ssd.h"
75
#include "ihevc_cabac_tables.h"
76
77
#include "ihevce_defs.h"
78
#include "ihevce_lap_enc_structs.h"
79
#include "ihevce_multi_thrd_structs.h"
80
#include "ihevce_me_common_defs.h"
81
#include "ihevce_had_satd.h"
82
#include "ihevce_error_codes.h"
83
#include "ihevce_bitstream.h"
84
#include "ihevce_cabac.h"
85
#include "ihevce_rdoq_macros.h"
86
#include "ihevce_function_selector.h"
87
#include "ihevce_enc_structs.h"
88
#include "ihevce_entropy_structs.h"
89
#include "ihevce_cmn_utils_instr_set_router.h"
90
#include "ihevce_enc_loop_structs.h"
91
#include "ihevce_ipe_instr_set_router.h"
92
#include "ihevce_ipe_structs.h"
93
#include "ihevce_ipe_pass.h"
94
#include "ihevce_recur_bracketing.h"
95
#include "ihevce_nbr_avail.h"
96
#include "ihevc_common_tables.h"
97
#include "ihevce_decomp_pre_intra_structs.h"
98
#include "ihevce_decomp_pre_intra_pass.h"
99
100
#include "cast_types.h"
101
#include "osal.h"
102
#include "osal_defaults.h"
103
104
/*****************************************************************************/
105
/* Constant Macros                                                           */
106
/*****************************************************************************/
107
#define IP_DBG_L1_l2 0
108
246k
#define CHILD_BIAS 12
109
110
/*****************************************************************************/
111
/* Globals                                                                   */
112
/*****************************************************************************/
113
extern pf_intra_pred g_apf_lum_ip[10];
114
115
extern WORD32 g_i4_ip_funcs[MAX_NUM_IP_MODES];
116
117
/* Column (x) of each 8x8 block inside the CTB, in 8x8 units, indexed by the
 * block counter used while walking the CTB. Each group of 16 entries spans
 * one 4x4 quadrant of 8x8 blocks, each group of 4 entries one 2x2 quadrant. */
UWORD8 gau1_cu_pos_x[64] = {
    0, 1, 0, 1, 2, 3, 2, 3,
    0, 1, 0, 1, 2, 3, 2, 3,
    4, 5, 4, 5, 6, 7, 6, 7,
    4, 5, 4, 5, 6, 7, 6, 7,
    0, 1, 0, 1, 2, 3, 2, 3,
    0, 1, 0, 1, 2, 3, 2, 3,
    4, 5, 4, 5, 6, 7, 6, 7,
    4, 5, 4, 5, 6, 7, 6, 7
};

/* Row (y) of each 8x8 block inside the CTB, in 8x8 units, using the same
 * indexing as gau1_cu_pos_x. */
UWORD8 gau1_cu_pos_y[64] = {
    0, 0, 1, 1, 0, 0, 1, 1,
    2, 2, 3, 3, 2, 2, 3, 3,
    0, 0, 1, 1, 0, 0, 1, 1,
    2, 2, 3, 3, 2, 2, 3, 3,
    4, 4, 5, 5, 4, 4, 5, 5,
    6, 6, 7, 7, 6, 6, 7, 7,
    4, 4, 5, 5, 4, 4, 5, 5,
    6, 6, 7, 7, 6, 6, 7, 7
};
124
125
/* Clears bit number `bit` of the lvalue `x` in place. Both arguments are
 * parenthesized so that compound expressions (e.g. a conditional expression
 * passed as `bit`) keep their intended grouping. Note that `x` is evaluated
 * twice, so it must not have side effects. */
#define RESET_BIT(x, bit) ((x) = (x) & ~((WORD32)1 << (bit)))
126
127
/*****************************************************************************/
128
/* Function Definitions                                                      */
129
/*****************************************************************************/
130
131
/*!
132
******************************************************************************
133
* \if Function name : ihevce_update_cand_list \endif
134
*
135
* \brief
136
*    Final Candidate list population, nbr flag andd nbr mode update function
137
*
138
* \param[in] ps_row_cu : pointer to cu analyse struct
139
* \param[in] ps_cu_node : pointer to cu node info buffer
140
* \param[in] ps_ed_blk_l1 : pointer to level 1 and 2 decision buffer
141
* \param[in] pu1_cand_mode_list  : pointer to candidate list buffer
142
*
143
* \return
144
*    None
145
*
146
* \author
147
*  Ittiam
148
*
149
*****************************************************************************
150
*/
151
void ihevce_update_cand_list(
152
    ihevce_ipe_cu_tree_t *ps_cu_node, ihevce_ed_blk_t *ps_ed_blk_l1, ihevce_ipe_ctxt_t *ps_ctxt)
153
182k
{
154
182k
    WORD32 row, col, x, y, size;
155
156
    /* Candidate mode Update */
157
182k
    (void)ps_ed_blk_l1;
158
    /* Update CTB mode map for the finalised CU */
159
182k
    x = ((ps_cu_node->u2_x0 << 3) >> 2) + 1;
160
182k
    y = ((ps_cu_node->u2_y0 << 3) >> 2) + 1;
161
182k
    size = ps_cu_node->u1_cu_size >> 2;
162
733k
    for(row = y; row < (y + size); row++)
163
551k
    {
164
2.81M
        for(col = x; col < (x + size); col++)
165
2.25M
        {
166
2.25M
            ps_ctxt->au1_ctb_mode_map[row][col] = ps_cu_node->best_mode;
167
2.25M
        }
168
551k
    }
169
182k
    return;
170
182k
}
171
172
/*!
173
******************************************************************************
174
* \if Function name : ihevce_intra_populate_mode_bits_cost_bracketing \endif
175
*
176
* \brief
177
*    Mpm indx calc function based on left and top available modes
178
*
179
* \param[in] top_intra_mode : Top available intra mode
180
* \param[in] left_intra_mode : Left available intra mode
181
* \param[in] available_top : Top availability flag
182
* \param[in] available_left : Left availability flag
183
* \param[in] cu_pos_y : cu position wrt to CTB
184
* \param[in] mode_bits_cost : pointer to mode bits buffer
185
* \param[in] lambda : Lambda value (SAD/SATD)
186
* \param[in] cand_mode_list  : pointer to candidate list buffer
187
*
188
* \return
189
*    None
190
*
191
* \author
192
*  Ittiam
193
*
194
*****************************************************************************
195
*/
196
void ihevce_intra_populate_mode_bits_cost_bracketing(
197
    WORD32 top_intra_mode,
198
    WORD32 left_intra_mode,
199
    WORD32 available_top,
200
    WORD32 available_left,
201
    WORD32 cu_pos_y,
202
    UWORD16 *mode_bits_cost,
203
    UWORD16 *mode_bits,
204
    WORD32 lambda,
205
    WORD32 *cand_mode_list)
206
1.44M
{
207
    /* local variables */
208
1.44M
    WORD32 i;
209
1.44M
    WORD32 cand_intra_pred_mode_left, cand_intra_pred_mode_top;
210
211
1.44M
    UWORD16 one_bits_cost =
212
1.44M
        COMPUTE_RATE_COST_CLIP30(4, lambda, (LAMBDA_Q_SHIFT + 1));  //1.5 * lambda
213
1.44M
    UWORD16 two_bits_cost =
214
1.44M
        COMPUTE_RATE_COST_CLIP30(6, lambda, (LAMBDA_Q_SHIFT + 1));  //2.5 * lambda
215
1.44M
    UWORD16 five_bits_cost =
216
1.44M
        COMPUTE_RATE_COST_CLIP30(12, lambda, (LAMBDA_Q_SHIFT + 1));  //5.5 * lambda
217
218
52.0M
    for(i = 0; i < 35; i++)
219
50.6M
    {
220
50.6M
        mode_bits_cost[i] = five_bits_cost;
221
50.6M
        mode_bits[i] = 5;
222
50.6M
    }
223
224
    /* EIID: set availability flag to zero if modes are invalid.
225
       Required since some CU's might be skipped (though available)
226
       and their modes will be set to 255 (-1)*/
227
1.44M
    if(35 < top_intra_mode || 0 > top_intra_mode)
228
0
        available_top = 0;
229
1.44M
    if(35 < left_intra_mode || 0 > left_intra_mode)
230
0
        available_left = 0;
231
232
    /* Calculate cand_intra_pred_mode_N as per sec. 8.4.2 in JCTVC-J1003_d7 */
233
    /* N = top */
234
1.44M
    if(0 == available_top)
235
88.8k
    {
236
88.8k
        cand_intra_pred_mode_top = INTRA_DC;
237
88.8k
    }
238
    /* for neighbour != INTRA, setting DC is done outside */
239
1.35M
    else if(0 == cu_pos_y) /* It's on the CTB boundary */
240
190k
    {
241
190k
        cand_intra_pred_mode_top = INTRA_DC;
242
190k
    }
243
1.16M
    else
244
1.16M
    {
245
1.16M
        cand_intra_pred_mode_top = top_intra_mode;
246
1.16M
    }
247
248
    /* N = left */
249
1.44M
    if(0 == available_left)
250
78.9k
    {
251
78.9k
        cand_intra_pred_mode_left = INTRA_DC;
252
        //cand_intra_pred_mode_left = cand_intra_pred_mode_top;
253
78.9k
    }
254
    /* for neighbour != INTRA, setting DC is done outside */
255
1.36M
    else
256
1.36M
    {
257
1.36M
        cand_intra_pred_mode_left = left_intra_mode;
258
1.36M
    }
259
260
    /* Calculate cand_mode_list as per sec. 8.4.2 in JCTVC-J1003_d7 */
261
1.44M
    if(cand_intra_pred_mode_left == cand_intra_pred_mode_top)
262
445k
    {
263
445k
        if(cand_intra_pred_mode_left < 2)
264
244k
        {
265
244k
            cand_mode_list[0] = INTRA_PLANAR;
266
244k
            cand_mode_list[1] = INTRA_DC;
267
244k
            cand_mode_list[2] = INTRA_ANGULAR(26); /* angular 26 = Vertical */
268
244k
        }
269
201k
        else
270
201k
        {
271
201k
            cand_mode_list[0] = cand_intra_pred_mode_left;
272
201k
            cand_mode_list[1] = 2 + ((cand_intra_pred_mode_left + 29) % 32);
273
201k
            cand_mode_list[2] = 2 + ((cand_intra_pred_mode_left - 2 + 1) % 32);
274
201k
        }
275
445k
    }
276
1.00M
    else
277
1.00M
    {
278
1.00M
        if(0 == available_left)
279
50.4k
        {
280
50.4k
            cand_mode_list[0] = cand_intra_pred_mode_top;
281
50.4k
            cand_mode_list[1] = cand_intra_pred_mode_left;
282
50.4k
        }
283
950k
        else
284
950k
        {
285
950k
            cand_mode_list[0] = cand_intra_pred_mode_left;
286
950k
            cand_mode_list[1] = cand_intra_pred_mode_top;
287
950k
        }
288
1.00M
        if((cand_intra_pred_mode_left != INTRA_PLANAR) &&
289
815k
           (cand_intra_pred_mode_top != INTRA_PLANAR))
290
691k
        {
291
691k
            cand_mode_list[2] = INTRA_PLANAR;
292
691k
        }
293
309k
        else if((cand_intra_pred_mode_left != INTRA_DC) && (cand_intra_pred_mode_top != INTRA_DC))
294
159k
        {
295
159k
            cand_mode_list[2] = INTRA_DC;
296
159k
        }
297
150k
        else
298
150k
        {
299
150k
            cand_mode_list[2] = INTRA_ANGULAR(26);
300
150k
        }
301
1.00M
    }
302
1.44M
    mode_bits_cost[cand_mode_list[0]] = one_bits_cost;
303
1.44M
    mode_bits_cost[cand_mode_list[1]] = two_bits_cost;
304
1.44M
    mode_bits_cost[cand_mode_list[2]] = two_bits_cost;
305
306
1.44M
    mode_bits[cand_mode_list[0]] = 2;
307
1.44M
    mode_bits[cand_mode_list[1]] = 3;
308
1.44M
    mode_bits[cand_mode_list[2]] = 3;
309
1.44M
}
310
311
/*!
312
******************************************************************************
313
* \if Function name : ihevce_pu_calc_4x4_blk \endif
314
*
315
* \brief
316
*    4x4 pu (8x8 CU) mode decision using step 8421 method
317
*
318
* \param[in] ps_cu_node : pointer to cu node info buffer
319
* \param[in] pu1_src : pointer to src pixels
320
* \param[in] src_stride : frm source stride
321
* \param[in] ref : pointer to reference pixels for prediction
322
* \param[in] cand_mode_list  : pointer to candidate list buffer
323
* \param[in] best_costs_4x4  : pointer to 3 best cost buffer
324
* \param[in] best_modes_4x4  : pointer to 3 best mode buffer
325
*
326
* \return
327
*    None
328
*
329
* \author
330
*  Ittiam
331
*
332
*****************************************************************************
333
*/
334
void ihevce_pu_calc_4x4_blk(
335
    ihevce_ipe_ctxt_t *ps_ctxt,
336
    ihevce_ipe_cu_tree_t *ps_cu_node,
337
    UWORD8 *pu1_src,
338
    WORD32 src_stride,
339
    UWORD8 *ref,
340
    UWORD16 *mode_bits_cost,
341
    WORD32 *best_costs_4x4,
342
    UWORD8 *best_modes_4x4,
343
    func_selector_t *ps_func_selector)
344
749k
{
345
749k
    WORD16 *pi2_trans_tmp = ps_ctxt->pi2_trans_tmp;
346
749k
    WORD16 *pi2_trans_out = ps_ctxt->pi2_trans_out;
347
749k
    UWORD8 u1_use_satd = ps_ctxt->u1_use_satd;
348
749k
    UWORD8 u1_level_1_refine_on = ps_ctxt->u1_level_1_refine_on;
349
350
749k
    WORD32 i, j = 0, i_end;
351
749k
    UWORD8 mode, best_amode = 255;
352
749k
    UWORD8 pred[16];
353
354
749k
    UWORD16 sad;
355
749k
    WORD32 sad_cost = 0;
356
749k
    WORD32 best_asad_cost = 0xFFFFF;
357
749k
    WORD32 temp;
358
749k
    UWORD8 modes_to_eval[5];
359
749k
    WORD32 costs_4x4[5];
360
749k
    UWORD8 modes_4x4[5] = { 0, 1, 2, 3, 4 };
361
362
    /* LO resolution hence low resolution disable */
363
749k
    WORD32 u1_low_resol = 0;
364
749k
    UWORD8 au1_best_modes[1] = { 0 };
365
749k
    WORD32 ai4_best_sad_costs[1] = { 0 };
366
367
749k
    WORD16 *pi2_tmp = &pi2_trans_tmp[0];
368
369
749k
    ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list =
370
749k
        &ps_ctxt->s_ipe_optimised_function_list;
371
372
    //apf_resd_trns[0] = &ihevc_resi_trans_4x4_ttype1;
373
    //apf_resd_trns[0] = &ihevc_HAD_4x4_8bit;
374
375
4.49M
    for(i = 0; i < 5; i++)
376
3.74M
    {
377
3.74M
        costs_4x4[i] = MAX_INTRA_COST_IPE;
378
3.74M
    }
379
380
749k
    ps_ipe_optimised_function_list->pf_ed_4x4_find_best_modes(
381
749k
        pu1_src,
382
749k
        src_stride,
383
749k
        ref,
384
749k
        mode_bits_cost,
385
749k
        au1_best_modes,
386
749k
        ai4_best_sad_costs,
387
749k
        u1_low_resol,
388
749k
        ps_ipe_optimised_function_list->pf_4x4_sad_computer);
389
390
749k
    best_amode = au1_best_modes[0];
391
749k
    best_asad_cost = ai4_best_sad_costs[0];
392
393
749k
    ASSERT(best_amode != 255);
394
    /* Around best level 4 angular mode, search for best level 2 mode */
395
749k
    modes_to_eval[0] = best_amode - 2;
396
749k
    modes_to_eval[1] = best_amode + 2;
397
749k
    i = 0;
398
749k
    i_end = 2;
399
749k
    if(best_amode == 2)
400
60.0k
        i = 1;
401
689k
    else if(best_amode == 34)
402
24.5k
        i_end = 1;
403
2.16M
    for(; i < i_end; i++)
404
1.41M
    {
405
1.41M
        mode = modes_to_eval[i];
406
407
1.41M
        g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
408
409
1.41M
        sad = ps_ipe_optimised_function_list->pf_4x4_sad_computer(pu1_src, &pred[0], src_stride, 4);
410
411
1.41M
        sad_cost = sad;
412
1.41M
        sad_cost += mode_bits_cost[mode];
413
414
1.41M
        if(sad_cost < best_asad_cost)
415
110k
        {
416
110k
            best_amode = mode;
417
110k
            best_asad_cost = sad_cost;
418
110k
        }
419
1.41M
    }
420
421
    /* Around best level 2 angular mode, search for best level 1 mode */
422
    /* Also evaluate for non-angular mode */
423
424
749k
    i = 0;
425
    /*Level 1 refinement is disabled for ES preset */
426
749k
    if(1 == u1_level_1_refine_on)
427
749k
    {
428
749k
        if(best_amode != 2)
429
694k
            modes_to_eval[i++] = best_amode - 1;
430
749k
        modes_to_eval[i++] = best_amode;
431
749k
    }
432
433
749k
    modes_to_eval[i++] = 0;
434
749k
    modes_to_eval[i++] = 1;
435
436
749k
    if(1 == u1_level_1_refine_on)
437
749k
    {
438
749k
        if(best_amode != 34)
439
727k
            modes_to_eval[i++] = best_amode + 1;
440
749k
    }
441
749k
    i_end = i;
442
749k
    i = 0;
443
444
4.41M
    for(; i < i_end; i++)
445
3.66M
    {
446
3.66M
        mode = modes_to_eval[i];
447
448
3.66M
        g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
449
450
        /* Hard coding to use SATD */
451
3.66M
        if(u1_use_satd)
452
2.74M
        {
453
2.74M
            ps_func_selector->ihevc_resi_trans_4x4_ttype1_fptr(
454
2.74M
                pu1_src, &pred[0], (WORD32 *)pi2_tmp, pi2_trans_out, src_stride, 4, 4, NULL_PLANE);
455
456
2.74M
            sad = ihevce_ipe_pass_satd(pi2_trans_out, 4, 4);
457
2.74M
        }
458
922k
        else
459
922k
        {
460
922k
            sad = ps_ipe_optimised_function_list->pf_4x4_sad_computer(
461
922k
                pu1_src, &pred[0], src_stride, 4);
462
922k
        }
463
3.66M
        sad_cost = sad;
464
3.66M
        sad_cost += mode_bits_cost[mode];
465
466
3.66M
        costs_4x4[i] = sad_cost;
467
3.66M
    }
468
469
    /* Arrange the reference array in ascending order */
470
3.66M
    for(i = 0; i < (i_end - 1); i++)
471
2.91M
    {
472
10.1M
        for(j = i + 1; j < i_end; j++)
473
7.18M
        {
474
7.18M
            if(costs_4x4[i] > costs_4x4[j])
475
2.73M
            {
476
2.73M
                temp = costs_4x4[i];
477
2.73M
                costs_4x4[i] = costs_4x4[j];
478
2.73M
                costs_4x4[j] = temp;
479
480
2.73M
                temp = modes_4x4[i];
481
2.73M
                modes_4x4[i] = modes_4x4[j];
482
2.73M
                modes_4x4[j] = temp;
483
2.73M
            }
484
7.18M
        }
485
2.91M
    }
486
2.99M
    for(i = 0; i < 3; i++)
487
2.24M
    {
488
2.24M
        best_costs_4x4[i] = costs_4x4[i];
489
2.24M
        best_modes_4x4[i] = modes_to_eval[modes_4x4[i]];
490
2.24M
    }
491
492
749k
    {
493
749k
        ps_cu_node->best_mode = best_modes_4x4[0];
494
749k
        ps_cu_node->best_cost = best_costs_4x4[0];
495
749k
        ps_cu_node->best_satd = best_costs_4x4[0] - mode_bits_cost[ps_cu_node->best_mode];
496
749k
    }
497
749k
}
498
499
/*!
500
******************************************************************************
501
* \if Function name : ihevce_pu_calc_8x8_blk \endif
502
*
503
* \brief
504
*    4x4 pu (8x8 CU) mode decision loop using step 8421 method
505
*
506
* \param[in] ps_curr_src : pointer to src pixels struct
507
* \param[in] ps_ctxt : pointer to IPE context struct
508
* \param[in] ps_cu_node : pointer to cu node info buffer
509
*
510
* \return
511
*    None
512
*
513
* \author
514
*  Ittiam
515
*
516
*****************************************************************************
517
*/
518
void ihevce_pu_calc_8x8_blk(
519
    iv_enc_yuv_buf_t *ps_curr_src,
520
    ihevce_ipe_ctxt_t *ps_ctxt,
521
    ihevce_ipe_cu_tree_t *ps_cu_node,
522
    func_selector_t *ps_func_selector)
523
187k
{
524
187k
    WORD32 i, j;
525
187k
    WORD32 nbr_flags;
526
187k
    nbr_avail_flags_t s_nbr;
527
187k
    WORD32 trans_size = ps_cu_node->ps_parent->u1_cu_size >> 1;
528
529
187k
    UWORD8 *pu1_src_4x4;
530
187k
    WORD32 xA, xB, yA, yB;
531
    //WORD32 x, y, size;
532
187k
    WORD32 top_intra_mode;
533
187k
    WORD32 left_intra_mode;
534
    //    WORD8 *top_intra_mode_ptr;
535
    //  WORD8 *left_intra_mode_ptr;
536
187k
    UWORD8 *pu1_orig;
537
187k
    WORD32 src_strd = ps_curr_src->i4_y_strd;
538
539
187k
    WORD32 cu_pos_x = ps_cu_node->ps_parent->u2_x0 << 1;
540
187k
    WORD32 cu_pos_y = ps_cu_node->ps_parent->u2_y0 << 1;
541
187k
    ihevc_intra_pred_luma_ref_substitution_ft *ihevc_intra_pred_luma_ref_substitution_fptr;
542
543
187k
    ihevc_intra_pred_luma_ref_substitution_fptr =
544
187k
        ps_ctxt->ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;
545
546
187k
    pu1_orig = (UWORD8 *)(ps_curr_src->pv_y_buf) +
547
187k
               ((ps_cu_node->ps_parent->u2_y0 << 3) * src_strd) +
548
187k
               (ps_cu_node->ps_parent->u2_x0 << 3);
549
561k
    for(i = 0; i < 2; i++)
550
374k
    {
551
1.12M
        for(j = 0; j < 2; j++)
552
749k
        {
553
749k
            WORD32 cand_mode_list[3];
554
749k
            pu1_src_4x4 = pu1_orig + (i * trans_size * src_strd) + (j * trans_size);
555
            /* get the neighbour availability flags */
556
749k
            nbr_flags = ihevce_get_nbr_intra(
557
749k
                &s_nbr,
558
749k
                ps_ctxt->pu1_ctb_nbr_map,
559
749k
                ps_ctxt->i4_nbr_map_strd,
560
749k
                cu_pos_x + ((j) * (trans_size >> 2)),
561
749k
                cu_pos_y + ((i) * (trans_size >> 2)),
562
749k
                trans_size >> 2);
563
564
            /* call the function which populates sad cost for all the modes */
565
749k
            xA = ((ps_cu_node->ps_parent->u2_x0 << 3) >> 2) + j;
566
749k
            yA = ((ps_cu_node->ps_parent->u2_y0 << 3) >> 2) + 1 + i;
567
749k
            xB = xA + 1;
568
749k
            yB = yA - 1;
569
749k
            left_intra_mode = ps_ctxt->au1_ctb_mode_map[yA][xA];
570
749k
            top_intra_mode = ps_ctxt->au1_ctb_mode_map[yB][xB];
571
572
749k
            ihevce_intra_populate_mode_bits_cost_bracketing(
573
749k
                top_intra_mode,
574
749k
                left_intra_mode,
575
749k
                s_nbr.u1_top_avail,
576
749k
                s_nbr.u1_left_avail,
577
749k
                ps_cu_node->ps_parent->u2_y0,
578
749k
                &ps_ctxt->au2_mode_bits_cost_8x8pu[i * 2 + j][0],
579
749k
                &ps_ctxt->au2_mode_bits_8x8_pu[0],
580
749k
                ps_ctxt->i4_ol_sad_lambda,
581
749k
                cand_mode_list);
582
583
            /* call the function which populates ref data for intra predicion */
584
749k
            ihevc_intra_pred_luma_ref_substitution_fptr(
585
749k
                pu1_src_4x4 - src_strd - 1,
586
749k
                pu1_src_4x4 - src_strd,
587
749k
                pu1_src_4x4 - 1,
588
749k
                src_strd,
589
749k
                4,
590
749k
                nbr_flags,
591
749k
                &ps_ctxt->au1_ref_8x8pu[i * 2 + j][0],
592
749k
                0);
593
594
749k
            ihevce_pu_calc_4x4_blk(
595
749k
                ps_ctxt,
596
749k
                ps_cu_node->ps_sub_cu[(i * 2) + j],
597
749k
                pu1_src_4x4,
598
749k
                src_strd,
599
749k
                &ps_ctxt->au1_ref_8x8pu[i * 2 + j][0],
600
749k
                &ps_ctxt->au2_mode_bits_cost_8x8pu[i * 2 + j][0],
601
749k
                &ps_cu_node->ps_sub_cu[(i * 2) + j]->au4_best_cost_1tu[0],
602
749k
                &ps_cu_node->ps_sub_cu[(i * 2) + j]->au1_best_mode_1tu[0],
603
749k
                ps_func_selector);
604
605
            /*&au4_cost_4x4[i*2 + j][0],
606
                &au1_modes_4x4[i*2 + j][0]);*/ //TTODO : mode will change for the four partition
607
608
749k
            ihevce_set_nbr_map(
609
749k
                ps_ctxt->pu1_ctb_nbr_map,
610
749k
                ps_ctxt->i4_nbr_map_strd,
611
749k
                cu_pos_x + ((j) * (trans_size >> 2)),
612
749k
                cu_pos_y + ((i) * (trans_size >> 2)),
613
749k
                (trans_size >> 2),
614
749k
                1);
615
616
749k
            xA = ((ps_cu_node->ps_parent->u2_x0 << 3) >> 2) + 1 + j;
617
749k
            yA = ((ps_cu_node->ps_parent->u2_y0 << 3) >> 2) + 1 + i;
618
749k
            ps_ctxt->au1_ctb_mode_map[yA][xA] = ps_cu_node->ps_sub_cu[i * 2 + j]->best_mode;
619
749k
            ps_cu_node->ps_sub_cu[i * 2 + j]->u2_mode_bits_cost =
620
749k
                ps_ctxt->au2_mode_bits_8x8_pu[ps_cu_node->ps_sub_cu[i * 2 + j]->best_mode];
621
749k
        }
622
374k
    }
623
187k
}
624
625
/*!
626
******************************************************************************
627
* \if Function name : ihevce_bracketing_analysis \endif
628
*
629
* \brief
630
*    Interface function that evaluates MAX cu and MAX - 1 cu, with MAX cu size
631
*    info decided coarse resolution mode decision. Compares the SATD/SAD cost btwn
632
*    2 CUS and determines the actual CU size and best 3 modes to be given to rdopt
633
*
634
* \param[in] ps_ctxt : pointer to IPE context struct
635
* \param[in] ps_cu_node : pointer to cu node info buffer
636
* \param[in] ps_curr_src : pointer to src pixels struct
637
* \param[in] ps_ctb_out : pointer to ip ctb out struct
638
* \param[in] ps_row_cu : pointer to cu analyse struct
639
* \param[in] ps_ed_l1_ctb : pointer to level 1 early deci struct
640
* \param[in] ps_ed_l2_ctb : pointer to level 2 early deci struct
641
* \param[in] ps_l0_ipe_out_ctb : pointer to ipe_l0_ctb_analyse_for_me_t struct
642
*
643
* \return
644
*    None
645
*
646
* \author
647
*  Ittiam
648
*
649
*****************************************************************************
650
*/
651
void ihevce_bracketing_analysis(
652
    ihevce_ipe_ctxt_t *ps_ctxt,
653
    ihevce_ipe_cu_tree_t *ps_cu_node,
654
    iv_enc_yuv_buf_t *ps_curr_src,
655
    ctb_analyse_t *ps_ctb_out,
656
    //cu_analyse_t         *ps_row_cu,
657
    ihevce_ed_blk_t *ps_ed_l1_ctb,
658
    ihevce_ed_blk_t *ps_ed_l2_ctb,
659
    ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
660
    ipe_l0_ctb_analyse_for_me_t *ps_l0_ipe_out_ctb)
661
14.0k
{
662
14.0k
    WORD32 cu_pos_x = 0;
663
14.0k
    WORD32 cu_pos_y = 0;
664
665
14.0k
    UWORD8 u1_curr_ctb_wdt = ps_cu_node->u1_width;
666
14.0k
    UWORD8 u1_curr_ctb_hgt = ps_cu_node->u1_height;
667
14.0k
    WORD32 num_8x8_blks_x = (u1_curr_ctb_wdt >> 3);
668
14.0k
    WORD32 num_8x8_blks_y = (u1_curr_ctb_hgt >> 3);
669
670
14.0k
    ihevce_ed_blk_t *ps_ed_blk_l1 = ps_ed_l1_ctb;
671
14.0k
    ihevce_ed_blk_t *ps_ed_blk_l2 = ps_ed_l2_ctb;
672
673
14.0k
    WORD32 i;
674
14.0k
    WORD32 cand_mode_list[3];
675
    //cu_analyse_t *ps_curr_cu = ps_row_cu;
676
14.0k
    WORD32 blk_cnt = 0;
677
14.0k
    WORD32 j = 0;
678
14.0k
    WORD32 merge_32x32_l1, merge_32x32_l2;
679
680
14.0k
    WORD32 i4_skip_intra_eval_32x32_l1;
681
    //EIID: flag indicating number of 16x16 blocks to be skipped for intra evaluation within 32x32 block
682
683
14.0k
    WORD32 parent_cost = 0;
684
14.0k
    WORD32 child_cost[4] = { 0 };
685
14.0k
    WORD32 child_cost_least = 0;
686
14.0k
    WORD32 child_satd[4] = { 0 };
687
14.0k
    WORD32 x, y, size;
688
14.0k
    WORD32 merge_64x64 = 1;
689
14.0k
    UWORD8 au1_best_32x32_modes[4];
690
14.0k
    WORD32 au4_best_32x32_cost[4];
691
14.0k
    WORD32 parent_best_mode;
692
14.0k
    UWORD8 best_mode;
693
694
14.0k
    WORD32 i4_quality_preset = ps_ctxt->i4_quality_preset;
695
    /* flag to control 1CU-4TU modes based on quality preset                */
696
    /* if set 1CU-4TU are explicitly evaluated else 1CU-1TU modes are copied */
697
14.0k
    WORD32 i4_enable_1cu_4tu = (i4_quality_preset == IHEVCE_QUALITY_P2) ||
698
13.1k
                               (i4_quality_preset == IHEVCE_QUALITY_P0);
699
700
    /* flag to control 4CU-16TU mode based on quality preset                */
701
    /* if set 4CU-16TU are explicitly evaluated else 4CU-4TU modes are copied */
702
14.0k
    WORD32 i4_enable_4cu_16tu = (i4_quality_preset == IHEVCE_QUALITY_P2) ||
703
13.1k
                                (i4_quality_preset == IHEVCE_QUALITY_P0);
704
705
14.0k
    WORD32 i4_mod_factor_num, i4_mod_factor_den = QP_MOD_FACTOR_DEN;  //2;
706
14.0k
    float f_strength;
707
    /* Accumulate SATD */
708
14.0k
    LWORD64 i8_frame_acc_satd_cost = 0, i8_frame_acc_satd_by_modqp_q10 = 0;
709
14.0k
    WORD32 i4_ctb_acc_satd = 0;
710
711
    /* Accumulate mode bits cost */
712
14.0k
    LWORD64 i8_frame_acc_mode_bits_cost = 0;
713
714
    /* Step2 is bypassed for parent, uses children modes*/
715
14.0k
    WORD32 step2_bypass = 1;
716
717
14.0k
    if(1 == ps_ctxt->u1_disable_child_cu_decide)
718
0
        step2_bypass = 0;
719
720
14.0k
    ps_cu_node->ps_parent = ps_ctxt->ps_ipe_cu_tree;
721
70.3k
    for(i = 0; i < 4; i++)
722
56.2k
    {
723
56.2k
        ps_cu_node->ps_sub_cu[i] = ps_ctxt->ps_ipe_cu_tree + 1 + i;
724
56.2k
    }
725
726
    /* Loop for all 8x8 block in a CTB */
727
14.0k
    ps_ctb_out->u4_cu_split_flags = 0x1;
728
729
    /* Initialize intra 64x64, 32x32 and 16x16 costs to max value */
730
70.3k
    for(i = 0; i < (MAX_CU_IN_CTB >> 4); i++)
731
56.2k
    {
732
56.2k
        ps_l0_ipe_out_ctb->ai4_best32x32_intra_cost[i] = MAX_INTRA_COST_IPE;
733
56.2k
    }
734
735
239k
    for(i = 0; i < (MAX_CU_IN_CTB >> 2); i++)
736
224k
    {
737
224k
        ps_l0_ipe_out_ctb->ai4_best16x16_intra_cost[i] = MAX_INTRA_COST_IPE;
738
224k
    }
739
740
914k
    for(i = 0; i < (MAX_CU_IN_CTB); i++)
741
899k
    {
742
899k
        ps_l0_ipe_out_ctb->ai4_best8x8_intra_cost[i] = MAX_INTRA_COST_IPE;
743
899k
    }
744
745
14.0k
    ps_l0_ipe_out_ctb->i4_best64x64_intra_cost = MAX_INTRA_COST_IPE;
746
747
    /* by default 64x64 modes are set to default values DC and Planar */
748
14.0k
    ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[0] = 0;
749
14.0k
    ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[1] = 1;
750
14.0k
    ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[2] = 255;
751
752
    /* by default 64x64 split is set to 1 */
753
14.0k
    ps_l0_ipe_out_ctb->u1_split_flag = 1;
754
755
    /* Modulation factor calculated based on spatial variance instead of hardcoded val*/
756
14.0k
    i4_mod_factor_num = ps_ctxt->ai4_mod_factor_derived_by_variance[1];  //16;
757
758
14.0k
    f_strength = ps_ctxt->f_strength;
759
760
    /* ------------------------------------------------ */
761
    /* populate the early decisions done by L1 analysis */
762
    /* ------------------------------------------------ */
763
239k
    for(i = 0; i < (MAX_CU_IN_CTB >> 2); i++)
764
224k
    {
765
224k
        ps_l0_ipe_out_ctb->ai4_best_sad_8x8_l1_ipe[i] = ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[i];
766
224k
        ps_l0_ipe_out_ctb->ai4_best_sad_cost_8x8_l1_ipe[i] = ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[i];
767
224k
        ps_l0_ipe_out_ctb->ai4_best_sad_8x8_l1_me[i] = ps_ed_ctb_l1->i4_best_sad_8x8_l1_me[i];
768
224k
        ps_l0_ipe_out_ctb->ai4_best_sad_cost_8x8_l1_me[i] = ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_me[i];
769
224k
    }
770
771
    /* Init CTB level accumulated SATD and MPM bits */
772
14.0k
    ps_l0_ipe_out_ctb->i4_ctb_acc_satd = 0;
773
14.0k
    ps_l0_ipe_out_ctb->i4_ctb_acc_mpm_bits = 0;
774
775
    /* ------------------------------------------------ */
776
    /* Loop over all the blocks in current CTB          */
777
    /* ------------------------------------------------ */
778
14.0k
    {
779
        /* 64 8x8 blocks should be encountered for the do,while loop to exit */
780
14.0k
        do
781
304k
        {
782
304k
            intra32_analyse_t *ps_intra32_analyse;
783
304k
            intra16_analyse_t *ps_intra16_analyse;
784
304k
            WORD32 *pi4_intra_32_cost;
785
304k
            WORD32 *pi4_intra_16_cost;
786
304k
            WORD32 *pi4_intra_8_cost;
787
304k
            WORD32 merge_16x16_l1;
788
789
            /* Given the blk_cnt, get the CU's top-left 8x8 block's x and y positions within the CTB */
790
304k
            cu_pos_x = gau1_cu_pos_x[blk_cnt];
791
304k
            cu_pos_y = gau1_cu_pos_y[blk_cnt];
792
793
            /* default value for 32x32 best mode - blk_cnt increases by 16 for each 32x32 */
794
304k
            au1_best_32x32_modes[blk_cnt >> 4] = 255;
795
796
            /* get the corresponding intra 32 analyse pointer  use (blk_cnt / 16) */
797
            /* blk cnt is in terms of 8x8 units so a 32x32 will have 16 8x8 units */
798
304k
            ps_intra32_analyse = &ps_l0_ipe_out_ctb->as_intra32_analyse[blk_cnt >> 4];
799
800
            /* get the corresponding intra 16 analyse pointer use (blk_cnt & 0xF / 4)*/
801
            /* blk cnt is in terms of 8x8 units so a 16x16 will have 4 8x8 units */
802
304k
            ps_intra16_analyse = &ps_intra32_analyse->as_intra16_analyse[(blk_cnt & 0xF) >> 2];
803
804
            /* Line below assumes min_cu_size of 8 - checks whether CU starts are within picture */
805
304k
            if((cu_pos_x < num_8x8_blks_x) && (cu_pos_y < num_8x8_blks_y))
806
117k
            {
807
                /* Reset to zero for every cu decision */
808
117k
                merge_32x32_l1 = 0;
809
810
117k
                child_cost_least = 0;
811
812
                /* At L2, each 4x4 corresponds to 16x16 at L0. Every 4 16x16 stores a merge_success flag */
813
117k
                ps_ed_blk_l2 = ps_ed_l2_ctb + (blk_cnt >> 2);
814
815
117k
                pi4_intra_32_cost = &ps_l0_ipe_out_ctb->ai4_best32x32_intra_cost[blk_cnt >> 4];
816
817
                /* by default 32x32 modes are set to default values DC and Planar */
818
117k
                ps_intra32_analyse->au1_best_modes_32x32_tu[0] = 0;
819
117k
                ps_intra32_analyse->au1_best_modes_32x32_tu[1] = 1;
820
117k
                ps_intra32_analyse->au1_best_modes_32x32_tu[2] = 255;
821
822
                /* By default 32x32 split is set to 1 */
823
117k
                ps_intra32_analyse->b1_split_flag = 1;
824
825
117k
                ps_intra32_analyse->au1_best_modes_16x16_tu[0] = 0;
826
117k
                ps_intra32_analyse->au1_best_modes_16x16_tu[1] = 1;
827
117k
                ps_intra32_analyse->au1_best_modes_16x16_tu[2] = 255;
828
829
                /* 16x16 cost & 8x8 cost are stored in Raster scan order */
830
                /* stride of 16x16 buffer is MAX_CU_IN_CTB_ROW >> 1      */
831
                /* stride of 8x8 buffer is MAX_CU_IN_CTB_ROW             */
832
117k
                {
833
117k
                    WORD32 pos_x_8x8, pos_y_8x8;
834
835
117k
                    pos_x_8x8 = gau1_cu_pos_x[blk_cnt];
836
117k
                    pos_y_8x8 = gau1_cu_pos_y[blk_cnt];
837
838
117k
                    pi4_intra_16_cost = &ps_l0_ipe_out_ctb->ai4_best16x16_intra_cost[0];
839
840
117k
                    pi4_intra_16_cost +=
841
117k
                        ((pos_x_8x8 >> 1) + ((pos_y_8x8 >> 1) * (MAX_CU_IN_CTB_ROW >> 1)));
842
843
117k
                    pi4_intra_8_cost = &ps_l0_ipe_out_ctb->ai4_best8x8_intra_cost[0];
844
845
117k
                    pi4_intra_8_cost += (pos_x_8x8 + (pos_y_8x8 * MAX_CU_IN_CTB_ROW));
846
117k
                }
847
848
117k
                merge_32x32_l1 = 0;
849
117k
                merge_32x32_l2 = 0;
850
117k
                i4_skip_intra_eval_32x32_l1 = 0;
851
852
                /* Enable 16x16 merge iff sufficient 8x8 blocks remain in the current CTB */
853
117k
                merge_16x16_l1 = 0;
854
117k
                if(((num_8x8_blks_x - cu_pos_x) >= 2) && ((num_8x8_blks_y - cu_pos_y) >= 2))
855
106k
                {
856
106k
#if !ENABLE_UNIFORM_CU_SIZE_8x8
857
106k
                    merge_16x16_l1 = ps_ed_blk_l1->merge_success;
858
#else
859
                    merge_16x16_l1 = 0;
860
#endif
861
106k
                }
862
863
                /* Enable 32x32 merge iff sufficient 8x8 blocks remain in the current CTB */
864
117k
                if(((num_8x8_blks_x - cu_pos_x) >= 4) && ((num_8x8_blks_y - cu_pos_y) >= 4))
865
70.2k
                {
866
                    /* Check 4 flags of L1(8x8) say merge */
867
351k
                    for(i = 0; i < 4; i++)
868
280k
                    {
869
280k
                        merge_32x32_l1 += (ps_ed_blk_l1 + (i * 4))->merge_success;
870
871
                        //EIID: number of 16x16 blocks for which the inter_intra flag says eval only inter, i.e. skip intra eval
872
280k
                        i4_skip_intra_eval_32x32_l1 +=
873
280k
                            ((ps_ed_blk_l1 + (i * 4))->intra_or_inter == 2) ? 1 : 0;
874
280k
                    }
875
876
70.2k
#if !ENABLE_UNIFORM_CU_SIZE_8x8
877
                    /* Check 1 flag from L2(16x16) say merge */
878
70.2k
                    merge_32x32_l2 = ps_ed_blk_l2->merge_success;
879
#else
880
                    merge_32x32_l1 = 0;
881
                    merge_32x32_l2 = 0;
882
#endif
883
70.2k
                }
884
885
117k
#if DISABLE_L2_IPE_IN_PB_L1_IN_B
886
117k
                if((i4_quality_preset == IHEVCE_QUALITY_P6) && (ps_ctxt->i4_slice_type != ISLICE))
887
4.17k
                {
888
4.17k
                    merge_32x32_l2 = 0;
889
4.17k
                    ps_ed_blk_l2->merge_success = 0;
890
4.17k
                }
891
117k
#endif
892
893
117k
                ps_intra32_analyse->b1_valid_cu = 1;
894
895
                /* If Merge success from all 4 L1 and L2, max CU size 32x32 is chosen */
896
                /* EIID: if all blocks to be skipped then skip entire 32x32 for intra eval,
897
                if no blocks to be skipped then eval entire 32x32,
898
                else break the merge and go to 16x16 level eval */
899
117k
                if((merge_32x32_l1 == 4) && merge_32x32_l2 &&
900
20.6k
                   ((i4_skip_intra_eval_32x32_l1 == 0) ||
901
1.09k
                    (i4_skip_intra_eval_32x32_l1 == 4))  //comment this line to disable break-merge
902
117k
                )
903
20.2k
                {
904
#if IP_DBG_L1_l2
905
                    /* Populate params for 32x32 block analysis */
906
                    ps_cu_node->ps_parent->best_cost = MAX_INTRA_COST_IPE;
907
908
                    ps_cu_node->ps_parent->u1_cu_size = 32;
909
                    ps_cu_node->ps_parent->u2_x0 = gau1_cu_pos_x[blk_cnt]; /* Populate properly */
910
                    ps_cu_node->ps_parent->u2_y0 = gau1_cu_pos_y[blk_cnt]; /* Populate properly */
911
                    ps_cu_node->ps_parent->best_mode = ps_ed_blk_l2->best_merge_mode;
912
                    /* CU size 32x32 and fill the final cu params */
913
914
                    ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
915
916
                    /* Increment pointers */
917
                    ps_ed_blk_l1 += 16;
918
                    blk_cnt += 16;
919
                    ps_row_cu++;
920
                    merge_64x64 &= 1;
921
#else
922
923
                    /* EIID: dont evaluate if all 4 blocks at L1 said inter is winning*/
924
20.2k
                    if(4 == i4_skip_intra_eval_32x32_l1 && (ps_ctxt->i4_slice_type != ISLICE))
925
661
                    {
926
661
                        WORD32 i4_local_ctr1, i4_local_ctr2;
927
928
661
                        ps_cu_node->ps_parent->best_cost = MAX_INTRA_COST_IPE;
929
930
661
                        ps_cu_node->ps_parent->u1_cu_size = 32;
931
661
                        ps_cu_node->ps_parent->u2_x0 =
932
661
                            gau1_cu_pos_x[blk_cnt]; /* Populate properly */
933
661
                        ps_cu_node->ps_parent->u2_y0 =
934
661
                            gau1_cu_pos_y[blk_cnt]; /* Populate properly */
935
661
                        ps_cu_node->ps_parent->best_mode =
936
661
                            INTRA_DC;  //ps_ed_blk_l2->best_merge_mode;
937
                        /* CU size 32x32 and fill the final cu params */
938
939
                        /* fill in the first modes as invalid */
940
661
                        ps_cu_node->ps_parent->au1_best_mode_1tu[0] = INTRA_DC;
941
661
                        ps_cu_node->ps_parent->au1_best_mode_1tu[1] =
942
661
                            INTRA_DC;  //for safery. Since update_cand_list will set num_modes as 3
943
661
                        ps_cu_node->ps_parent->au1_best_mode_1tu[2] = INTRA_DC;
944
945
661
                        ps_cu_node->ps_parent->au1_best_mode_4tu[0] = INTRA_DC;
946
661
                        ps_cu_node->ps_parent->au1_best_mode_4tu[1] = INTRA_DC;
947
661
                        ps_cu_node->ps_parent->au1_best_mode_4tu[2] = INTRA_DC;
948
949
661
                        ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
950
951
                        //ps_row_cu->s_cu_intra_cand.b6_num_intra_cands = 0;
952
                        //ps_row_cu->u1_num_intra_rdopt_cands = 0;
953
954
661
                        ps_intra32_analyse->b1_valid_cu = 0;
955
661
                        ps_intra32_analyse->b1_split_flag = 0;
956
661
                        ps_intra32_analyse->b1_merge_flag = 0;
957
                        /*memset (&ps_intra32_analyse->au1_best_modes_32x32_tu,
958
                        255,
959
                        NUM_BEST_MODES);
960
                        memset (&ps_intra32_analyse->au1_best_modes_16x16_tu,
961
                        255,
962
                        NUM_BEST_MODES);*/
963
                        //set only the first mode: if it is 255, it won't go ahead
964
661
                        ps_intra32_analyse->au1_best_modes_32x32_tu[0] = 255;
965
661
                        ps_intra32_analyse->au1_best_modes_16x16_tu[0] = 255;
966
967
661
                        *pi4_intra_32_cost = MAX_INTRA_COST_IPE;
968
969
                        /*since ME will start evaluating from bottom up, set the lower
970
                        cu size data invalid */
971
3.30k
                        for(i4_local_ctr1 = 0; i4_local_ctr1 < 4; i4_local_ctr1++)
972
2.64k
                        {
973
2.64k
                            WORD32 *pi4_intra_8_cost_curr16;
974
975
2.64k
                            ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
976
2.64k
                                .au1_best_modes_16x16_tu[0] = 255;
977
2.64k
                            ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
978
2.64k
                                .au1_best_modes_8x8_tu[0] = 255;
979
2.64k
                            ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1].b1_merge_flag = 0;
980
2.64k
                            ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1].b1_valid_cu = 0;
981
2.64k
                            ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1].b1_split_flag = 0;
982
983
2.64k
                            pi4_intra_16_cost
984
2.64k
                                [(i4_local_ctr1 & 1) + ((MAX_CU_IN_CTB_ROW >> 1) *
985
2.64k
                                                        (i4_local_ctr1 >> 1))] = MAX_INTRA_COST_IPE;
986
987
2.64k
                            pi4_intra_8_cost_curr16 = pi4_intra_8_cost + ((i4_local_ctr1 & 1) << 1);
988
2.64k
                            pi4_intra_8_cost_curr16 +=
989
2.64k
                                ((i4_local_ctr1 >> 1) << 1) * MAX_CU_IN_CTB_ROW;
990
991
13.2k
                            for(i4_local_ctr2 = 0; i4_local_ctr2 < 4; i4_local_ctr2++)
992
10.5k
                            {
993
10.5k
                                ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
994
10.5k
                                    .as_intra8_analyse[i4_local_ctr2]
995
10.5k
                                    .au1_4x4_best_modes[0][0] = 255;
996
10.5k
                                ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
997
10.5k
                                    .as_intra8_analyse[i4_local_ctr2]
998
10.5k
                                    .au1_4x4_best_modes[1][0] = 255;
999
10.5k
                                ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1000
10.5k
                                    .as_intra8_analyse[i4_local_ctr2]
1001
10.5k
                                    .au1_4x4_best_modes[2][0] = 255;
1002
10.5k
                                ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1003
10.5k
                                    .as_intra8_analyse[i4_local_ctr2]
1004
10.5k
                                    .au1_4x4_best_modes[3][0] = 255;
1005
10.5k
                                ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1006
10.5k
                                    .as_intra8_analyse[i4_local_ctr2]
1007
10.5k
                                    .au1_best_modes_8x8_tu[0] = 255;
1008
10.5k
                                ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1009
10.5k
                                    .as_intra8_analyse[i4_local_ctr2]
1010
10.5k
                                    .au1_best_modes_4x4_tu[0] = 255;
1011
10.5k
                                ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
1012
10.5k
                                    .as_intra8_analyse[i4_local_ctr2]
1013
10.5k
                                    .b1_valid_cu = 0;
1014
1015
10.5k
                                pi4_intra_8_cost_curr16
1016
10.5k
                                    [(i4_local_ctr2 & 1) +
1017
10.5k
                                     (MAX_CU_IN_CTB_ROW * (i4_local_ctr2 >> 1))] =
1018
10.5k
                                        MAX_INTRA_COST_IPE;
1019
10.5k
                            }
1020
2.64k
                        }
1021
1022
                        /* set neighbours even if intra is not evaluated, since source is always available. */
1023
661
                        ihevce_set_nbr_map(
1024
661
                            ps_ctxt->pu1_ctb_nbr_map,
1025
661
                            ps_ctxt->i4_nbr_map_strd,
1026
661
                            ps_cu_node->ps_parent->u2_x0 << 1,
1027
661
                            ps_cu_node->ps_parent->u2_y0 << 1,
1028
661
                            (ps_cu_node->ps_parent->u1_cu_size >> 2),
1029
661
                            1);
1030
1031
                        /* cost accumulation of best cu size candidate */
1032
                        /*i8_frame_acc_satd_cost += parent_cost;*/
1033
1034
                        /* Mode bits cost accumulation for best cu size and cu mode */
1035
                        /*i8_frame_acc_mode_bits_cost += ps_cu_node->ps_parent->u2_mode_bits_cost;*/
1036
1037
                        /*satd/mod_qp accumulation of best cu */
1038
                        /*i8_frame_acc_satd_by_modqp_q10 += ((LWORD64)ps_cu_node->ps_parent->best_satd << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3))/i4_q_scale_q3_mod;*/
1039
1040
                        /* Increment pointers */
1041
661
                        ps_ed_blk_l1 += 16;
1042
661
                        blk_cnt += 16;
1043
                        //ps_row_cu++;
1044
661
                        merge_64x64 = 0;
1045
1046
                        /* increment for stat purpose only. Increment is valid only on single thread */
1047
661
                        ps_ctxt->u4_num_16x16_skips_at_L0_IPE += 4;
1048
661
                    }
1049
19.5k
                    else
1050
19.5k
                    {
1051
                        /* Re-evaluation of 4 16x16 blocks at 8x8 prediction level */
1052
                        //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
1053
1054
19.5k
                        if((ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P6) &&
1055
72
                           (ps_ctxt->i4_slice_type == PSLICE))
1056
0
                        {
1057
0
                            ps_ctxt->u1_disable_child_cu_decide = 1;
1058
0
                            step2_bypass = 0;
1059
0
                        }
1060
1061
                        /* Based on the flag, Child modes decision can be disabled*/
1062
19.5k
                        if(0 == ps_ctxt->u1_disable_child_cu_decide)
1063
19.5k
                        {
1064
97.8k
                            for(j = 0; j < 4; j++)
1065
78.3k
                            {
1066
78.3k
                                ps_cu_node->ps_sub_cu[j]->u2_x0 =
1067
78.3k
                                    gau1_cu_pos_x[blk_cnt + (j * 4)]; /* Populate properly */
1068
78.3k
                                ps_cu_node->ps_sub_cu[j]->u2_y0 =
1069
78.3k
                                    gau1_cu_pos_y[blk_cnt + (j * 4)]; /* Populate properly */
1070
78.3k
                                ps_cu_node->ps_sub_cu[j]->u1_cu_size = 16;
1071
1072
78.3k
                                {
1073
78.3k
                                    WORD32 best_ang_mode =
1074
78.3k
                                        (ps_ed_blk_l1 + (j * 4))->best_merge_mode;
1075
1076
78.3k
                                    if(best_ang_mode < 2)
1077
69.1k
                                        best_ang_mode = 26;
1078
1079
78.3k
                                    ihevce_mode_eval_filtering(
1080
78.3k
                                        ps_cu_node->ps_sub_cu[j],
1081
78.3k
                                        ps_cu_node,
1082
78.3k
                                        ps_ctxt,
1083
78.3k
                                        ps_curr_src,
1084
78.3k
                                        best_ang_mode,
1085
78.3k
                                        &ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0],
1086
78.3k
                                        &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1087
78.3k
                                        !step2_bypass,
1088
78.3k
                                        1);
1089
1090
78.3k
                                    if(i4_enable_4cu_16tu)
1091
43.3k
                                    {
1092
43.3k
                                        ihevce_mode_eval_filtering(
1093
43.3k
                                            ps_cu_node->ps_sub_cu[j],
1094
43.3k
                                            ps_cu_node,
1095
43.3k
                                            ps_ctxt,
1096
43.3k
                                            ps_curr_src,
1097
43.3k
                                            best_ang_mode,
1098
43.3k
                                            &ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1099
43.3k
                                            &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1100
43.3k
                                            !step2_bypass,
1101
43.3k
                                            0);
1102
43.3k
                                    }
1103
34.9k
                                    else
1104
34.9k
                                    {
1105
                                        /* 4TU not evaluated :  4tu modes set same as 1tu modes */
1106
34.9k
                                        memcpy(
1107
34.9k
                                            &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1108
34.9k
                                            &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1109
34.9k
                                            NUM_BEST_MODES);
1110
1111
                                        /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
1112
34.9k
                                        memcpy(
1113
34.9k
                                            &ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1114
34.9k
                                            &ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0],
1115
34.9k
                                            NUM_BEST_MODES * sizeof(WORD32));
1116
34.9k
                                    }
1117
1118
78.3k
                                    child_cost[j] =
1119
78.3k
                                        MIN(ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1120
78.3k
                                            ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0]);
1121
1122
                                    /* Child cost is sum of costs at 16x16 level  */
1123
78.3k
                                    child_cost_least += child_cost[j];
1124
1125
                                    /* Select the best mode to be populated as top and left nbr depending on the
1126
                                    4tu and 1tu cost */
1127
78.3k
                                    if(ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0] >
1128
78.3k
                                       ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0])
1129
3.11k
                                    {
1130
3.11k
                                        ps_cu_node->ps_sub_cu[j]->best_mode =
1131
3.11k
                                            ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0];
1132
3.11k
                                    }
1133
75.1k
                                    else
1134
75.1k
                                    {
1135
75.1k
                                        ps_cu_node->ps_sub_cu[j]->best_mode =
1136
75.1k
                                            ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0];
1137
75.1k
                                    }
1138
1139
78.3k
                                    { /* Update the CTB nodes only for MAX - 1 CU nodes */
1140
78.3k
                                        WORD32 xA, yA, row, col;
1141
78.3k
                                        xA = ((ps_cu_node->ps_sub_cu[j]->u2_x0 << 3) >> 2) + 1;
1142
78.3k
                                        yA = ((ps_cu_node->ps_sub_cu[j]->u2_y0 << 3) >> 2) + 1;
1143
78.3k
                                        size = ps_cu_node->ps_sub_cu[j]->u1_cu_size >> 2;
1144
391k
                                        for(row = yA; row < (yA + size); row++)
1145
313k
                                        {
1146
1.56M
                                            for(col = xA; col < (xA + size); col++)
1147
1.25M
                                            {
1148
1.25M
                                                ps_ctxt->au1_ctb_mode_map[row][col] =
1149
1.25M
                                                    ps_cu_node->ps_sub_cu[j]->best_mode;
1150
1.25M
                                            }
1151
313k
                                        }
1152
78.3k
                                    }
1153
78.3k
                                }
1154
1155
                                /*Child SATD cost*/
1156
78.3k
                                child_satd[j] = ps_cu_node->ps_sub_cu[j]->best_satd;
1157
1158
                                /* store the child 16x16 costs */
1159
78.3k
                                pi4_intra_16_cost[(j & 1) + ((MAX_CU_IN_CTB_ROW >> 1) * (j >> 1))] =
1160
78.3k
                                    child_cost[j];
1161
1162
                                /* set the CU valid flag */
1163
78.3k
                                ps_intra16_analyse[j].b1_valid_cu = 1;
1164
1165
                                /* All 16x16 merge is valid, if Cu 32x32 is chosen */
1166
                                /* To be reset, if CU 64x64 is chosen */
1167
78.3k
                                ps_intra16_analyse[j].b1_merge_flag = 1;
1168
1169
                                /* storing the modes to intra 16 analyse */
1170
                                /* store the best 16x16 modes 8x8 tu */
1171
78.3k
                                memcpy(
1172
78.3k
                                    &ps_intra16_analyse[j].au1_best_modes_8x8_tu[0],
1173
78.3k
                                    &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1174
78.3k
                                    sizeof(UWORD8) * (NUM_BEST_MODES));
1175
78.3k
                                ps_intra16_analyse[j].au1_best_modes_8x8_tu[NUM_BEST_MODES] = 255;
1176
1177
                                /* store the best 16x16 modes 16x16 tu */
1178
78.3k
                                memcpy(
1179
78.3k
                                    &ps_intra16_analyse[j].au1_best_modes_16x16_tu[0],
1180
78.3k
                                    &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1181
78.3k
                                    sizeof(UWORD8) * (NUM_BEST_MODES));
1182
78.3k
                                ps_intra16_analyse[j].au1_best_modes_16x16_tu[NUM_BEST_MODES] = 255;
1183
1184
                                /* divide the 16x16 costs (pro rating) to 4 8x8 costs */
1185
                                /* store the same 16x16 modes as 4 8x8 child modes    */
1186
78.3k
                                {
1187
78.3k
                                    WORD32 idx_8x8;
1188
78.3k
                                    WORD32 *pi4_intra_8_cost_curr16;
1189
78.3k
                                    intra8_analyse_t *ps_intra8_analyse;
1190
1191
78.3k
                                    pi4_intra_8_cost_curr16 = pi4_intra_8_cost + ((j & 1) << 1);
1192
78.3k
                                    pi4_intra_8_cost_curr16 += ((j >> 1) << 1) * MAX_CU_IN_CTB_ROW;
1193
1194
391k
                                    for(idx_8x8 = 0; idx_8x8 < 4; idx_8x8++)
1195
313k
                                    {
1196
313k
                                        pi4_intra_8_cost_curr16
1197
313k
                                            [(idx_8x8 & 1) + (MAX_CU_IN_CTB_ROW * (idx_8x8 >> 1))] =
1198
313k
                                                (child_cost[j] + 3) >> 2;
1199
1200
313k
                                        ps_intra8_analyse =
1201
313k
                                            &ps_intra16_analyse[j].as_intra8_analyse[idx_8x8];
1202
1203
313k
                                        ps_intra8_analyse->b1_enable_nxn = 0;
1204
313k
                                        ps_intra8_analyse->b1_valid_cu = 1;
1205
1206
                                        /* store the best 8x8 modes 8x8 tu */
1207
313k
                                        memcpy(
1208
313k
                                            &ps_intra8_analyse->au1_best_modes_8x8_tu[0],
1209
313k
                                            &ps_intra16_analyse[j].au1_best_modes_8x8_tu[0],
1210
313k
                                            sizeof(UWORD8) * (NUM_BEST_MODES + 1));
1211
1212
                                        /* store the best 8x8 modes 4x4 tu */
1213
313k
                                        memcpy(
1214
313k
                                            &ps_intra8_analyse->au1_best_modes_4x4_tu[0],
1215
313k
                                            &ps_intra16_analyse[j].au1_best_modes_8x8_tu[0],
1216
313k
                                            sizeof(UWORD8) * (NUM_BEST_MODES + 1));
1217
1218
                                        /* NXN modes not evaluated hence set to 255 */
1219
313k
                                        memset(
1220
313k
                                            &ps_intra8_analyse->au1_4x4_best_modes[0][0],
1221
313k
                                            255,
1222
313k
                                            sizeof(UWORD8) * 4 * (NUM_BEST_MODES + 1));
1223
313k
                                    }
1224
78.3k
                                }
1225
78.3k
                            }
1226
1227
19.5k
                            ihevce_set_nbr_map(
1228
19.5k
                                ps_ctxt->pu1_ctb_nbr_map,
1229
19.5k
                                ps_ctxt->i4_nbr_map_strd,
1230
19.5k
                                ps_cu_node->ps_sub_cu[0]->u2_x0 << 1,
1231
19.5k
                                ps_cu_node->ps_sub_cu[0]->u2_y0 << 1,
1232
19.5k
                                (ps_cu_node->ps_sub_cu[0]->u1_cu_size >> 1),
1233
19.5k
                                0);
1234
19.5k
                        }
1235
0
#if 1  //DISBLE_CHILD_CU_EVAL_L0_IPE //1
1236
0
                        else
1237
0
                        {
1238
0
                            for(j = 0; j < 4; j++)
1239
0
                            {
1240
0
                                WORD32 idx_8x8;
1241
0
                                intra8_analyse_t *ps_intra8_analyse;
1242
0
                                ps_intra16_analyse[j].au1_best_modes_8x8_tu[0] = 255;
1243
0
                                ps_intra16_analyse[j].au1_best_modes_16x16_tu[0] = 255;
1244
1245
0
                                ps_intra16_analyse[j].b1_valid_cu = 0;
1246
1247
0
                                for(idx_8x8 = 0; idx_8x8 < 4; idx_8x8++)
1248
0
                                {
1249
0
                                    ps_intra8_analyse =
1250
0
                                        &ps_intra16_analyse[j].as_intra8_analyse[idx_8x8];
1251
1252
0
                                    ps_intra8_analyse->au1_best_modes_8x8_tu[0] = 255;
1253
0
                                    ps_intra8_analyse->au1_best_modes_4x4_tu[0] = 255;
1254
1255
0
                                    ps_intra8_analyse->b1_enable_nxn = 0;
1256
0
                                    ps_intra8_analyse->b1_valid_cu = 0;
1257
1258
                                    /* NXN modes not evaluated hence set to 255 */
1259
0
                                    memset(
1260
0
                                        &ps_intra8_analyse->au1_4x4_best_modes[0][0],
1261
0
                                        255,
1262
0
                                        sizeof(UWORD8) * 4 * (NUM_BEST_MODES + 1));
1263
0
                                }
1264
0
                            }
1265
1266
0
                            child_cost_least = MAX_INTRA_COST_IPE;
1267
0
                        }
1268
19.5k
#endif
1269
1270
                        /* Populate params for 32x32 block analysis */
1271
1272
19.5k
                        ps_cu_node->ps_parent->u1_cu_size = 32;
1273
19.5k
                        ps_cu_node->ps_parent->u2_x0 =
1274
19.5k
                            gau1_cu_pos_x[blk_cnt]; /* Populate properly */
1275
19.5k
                        ps_cu_node->ps_parent->u2_y0 =
1276
19.5k
                            gau1_cu_pos_y[blk_cnt]; /* Populate properly */
1277
1278
                        /* Re-evaluation for 32x32 parent block at 16x16 prediction level */
1279
                        //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
1280
1281
19.5k
                        {
1282
                            /* Eval for TUSize = CuSize */
1283
19.5k
                            ihevce_mode_eval_filtering(
1284
19.5k
                                ps_cu_node->ps_parent,
1285
19.5k
                                ps_cu_node,
1286
19.5k
                                ps_ctxt,
1287
19.5k
                                ps_curr_src,
1288
19.5k
                                26,
1289
19.5k
                                &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
1290
19.5k
                                &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1291
19.5k
                                step2_bypass,
1292
19.5k
                                1);
1293
1294
19.5k
                            if(i4_enable_1cu_4tu)
1295
10.8k
                            {
1296
                                /* Eval for TUSize = CuSize/2 */
1297
10.8k
                                ihevce_mode_eval_filtering(
1298
10.8k
                                    ps_cu_node->ps_parent,
1299
10.8k
                                    ps_cu_node,
1300
10.8k
                                    ps_ctxt,
1301
10.8k
                                    ps_curr_src,
1302
10.8k
                                    26,
1303
10.8k
                                    &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1304
10.8k
                                    &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1305
10.8k
                                    step2_bypass,
1306
10.8k
                                    0);
1307
10.8k
                            }
1308
8.74k
                            else
1309
8.74k
                            {
1310
                                /* 4TU not evaluated :  4tu modes set same as 1tu modes */
1311
8.74k
                                memcpy(
1312
8.74k
                                    &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1313
8.74k
                                    &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1314
8.74k
                                    NUM_BEST_MODES);
1315
1316
                                /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
1317
8.74k
                                memcpy(
1318
8.74k
                                    &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1319
8.74k
                                    &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
1320
8.74k
                                    NUM_BEST_MODES * sizeof(WORD32));
1321
8.74k
                            }
1322
19.5k
                        }
1323
1324
19.5k
                        ps_ctxt->u1_disable_child_cu_decide = 0;
1325
19.5k
                        step2_bypass = 1;
1326
1327
                        /* Update parent cost */
1328
19.5k
                        parent_cost =
1329
19.5k
                            MIN(ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1330
19.5k
                                ps_cu_node->ps_parent->au4_best_cost_1tu[0]);
1331
1332
                        /* Select the best mode to be populated as top and left nbr depending on the
1333
                        4tu and 1tu cost */
1334
19.5k
                        if(ps_cu_node->ps_parent->au4_best_cost_4tu[0] >
1335
19.5k
                           ps_cu_node->ps_parent->au4_best_cost_1tu[0])
1336
970
                        {
1337
970
                            ps_cu_node->ps_parent->best_mode =
1338
970
                                ps_cu_node->ps_parent->au1_best_mode_1tu[0];
1339
970
                        }
1340
18.6k
                        else
1341
18.6k
                        {
1342
18.6k
                            ps_cu_node->ps_parent->best_mode =
1343
18.6k
                                ps_cu_node->ps_parent->au1_best_mode_4tu[0];
1344
18.6k
                        }
1345
1346
                        /* store the 32x32 cost */
1347
19.5k
                        *pi4_intra_32_cost = parent_cost;
1348
1349
                        /* set the CU valid flag */
1350
19.5k
                        ps_intra32_analyse->b1_valid_cu = 1;
1351
1352
19.5k
                        ps_intra32_analyse->b1_merge_flag = 1;
1353
1354
                        /* storing the modes to intra 32 analyse */
1355
19.5k
                        {
1356
                            /* store the best 32x32 modes 16x16 tu */
1357
19.5k
                            memcpy(
1358
19.5k
                                &ps_intra32_analyse->au1_best_modes_16x16_tu[0],
1359
19.5k
                                &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1360
19.5k
                                sizeof(UWORD8) * (NUM_BEST_MODES));
1361
19.5k
                            ps_intra32_analyse->au1_best_modes_16x16_tu[NUM_BEST_MODES] = 255;
1362
1363
                            /* store the best 32x32 modes 32x32 tu */
1364
19.5k
                            memcpy(
1365
19.5k
                                &ps_intra32_analyse->au1_best_modes_32x32_tu[0],
1366
19.5k
                                &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1367
19.5k
                                sizeof(UWORD8) * (NUM_BEST_MODES));
1368
19.5k
                            ps_intra32_analyse->au1_best_modes_32x32_tu[NUM_BEST_MODES] = 255;
1369
19.5k
                        }
1370
19.5k
                        parent_best_mode = ps_cu_node->ps_parent->best_mode;
1371
19.5k
                        if((parent_cost <=
1372
19.5k
                            child_cost_least + (ps_ctxt->i4_ol_satd_lambda * CHILD_BIAS >>
1373
19.5k
                                                LAMBDA_Q_SHIFT)))  //|| identical_modes)
1374
16.5k
                        {
1375
16.5k
                            WORD32 i4_q_scale_q3_mod;
1376
16.5k
                            UWORD8 u1_cu_possible_qp;
1377
16.5k
                            WORD32 i4_act_factor;
1378
1379
                            /* CU size 32x32 and fill the final cu params */
1380
1381
16.5k
                            ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
1382
1383
16.5k
                            if((IHEVCE_QUALITY_P3 > i4_quality_preset))
1384
9.32k
                            {
1385
46.6k
                                for(i = 0; i < 4; i++)
1386
37.2k
                                {
1387
37.2k
                                    intra8_analyse_t *ps_intra8_analyse;
1388
37.2k
                                    ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[i];
1389
186k
                                    for(j = 0; j < 4; j++)
1390
149k
                                    {
1391
                                        /* Populate best 3 nxn modes */
1392
149k
                                        ps_intra8_analyse->au1_4x4_best_modes[j][0] =
1393
149k
                                            ps_cu_node->ps_sub_cu[i]->au1_best_mode_4tu[0];
1394
149k
                                        ps_intra8_analyse->au1_4x4_best_modes[j][1] =
1395
149k
                                            ps_cu_node->ps_sub_cu[i]
1396
149k
                                                ->au1_best_mode_4tu[1];  //(ps_ed + 1)->best_mode;
1397
149k
                                        ps_intra8_analyse->au1_4x4_best_modes[j][2] =
1398
149k
                                            ps_cu_node->ps_sub_cu[i]
1399
149k
                                                ->au1_best_mode_4tu[2];  //(ps_ed + 2)->best_mode;
1400
149k
                                        ps_intra8_analyse->au1_4x4_best_modes[j][3] = 255;
1401
149k
                                    }
1402
37.2k
                                }
1403
9.32k
                            }
1404
                            /* store the 32x32 non split flag */
1405
16.5k
                            ps_intra32_analyse->b1_split_flag = 0;
1406
16.5k
                            ps_intra32_analyse->as_intra16_analyse[0].b1_split_flag = 0;
1407
16.5k
                            ps_intra32_analyse->as_intra16_analyse[1].b1_split_flag = 0;
1408
16.5k
                            ps_intra32_analyse->as_intra16_analyse[2].b1_split_flag = 0;
1409
16.5k
                            ps_intra32_analyse->as_intra16_analyse[3].b1_split_flag = 0;
1410
1411
16.5k
                            au1_best_32x32_modes[blk_cnt >> 4] =
1412
16.5k
                                ps_cu_node->ps_parent->au1_best_mode_1tu[0];
1413
1414
16.5k
                            au4_best_32x32_cost[blk_cnt >> 4] =
1415
16.5k
                                ps_cu_node->ps_parent->au4_best_cost_1tu[0];
1416
                            /*As 32*32 has won, pick L2 8x8 qp which maps
1417
                            to L0 32x32 Qp*/
1418
16.5k
                            ASSERT(((blk_cnt >> 4) & 3) == (blk_cnt >> 4));
1419
16.5k
                            ASSERT(ps_ed_ctb_l1->i4_16x16_satd[blk_cnt >> 4][0] != -2);
1420
16.5k
                            u1_cu_possible_qp = ihevce_cu_level_qp_mod(
1421
16.5k
                                ps_ctxt->i4_qscale,
1422
16.5k
                                ps_ed_ctb_l1->i4_16x16_satd[blk_cnt >> 4][0],
1423
16.5k
                                ps_ctxt->ld_curr_frame_16x16_log_avg[0],
1424
16.5k
                                f_strength,
1425
16.5k
                                &i4_act_factor,
1426
16.5k
                                &i4_q_scale_q3_mod,
1427
16.5k
                                ps_ctxt->ps_rc_quant_ctxt);
1428
                            /* cost accumulation of best cu size candidate */
1429
16.5k
                            i8_frame_acc_satd_cost += parent_cost;
1430
1431
                            /* satd and mpm bits accumulation of best cu size candidate */
1432
16.5k
                            i4_ctb_acc_satd += ps_cu_node->ps_parent->best_satd;
1433
1434
                            /* Mode bits cost accumulation for best cu size and cu mode */
1435
16.5k
                            i8_frame_acc_mode_bits_cost += ps_cu_node->ps_parent->u2_mode_bits_cost;
1436
1437
                            /*satd/mod_qp accumulation of best cu */
1438
16.5k
                            i8_frame_acc_satd_by_modqp_q10 +=
1439
16.5k
                                ((LWORD64)ps_cu_node->ps_parent->best_satd
1440
16.5k
                                 << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
1441
16.5k
                                i4_q_scale_q3_mod;
1442
1443
                            /* Increment pointers */
1444
16.5k
                            ps_ed_blk_l1 += 16;
1445
16.5k
                            blk_cnt += 16;
1446
                            //ps_row_cu++;
1447
16.5k
                            merge_64x64 &= 1;
1448
16.5k
                        }
1449
2.97k
                        else
1450
2.97k
                        {
1451
                            /* store the 32x32 split flag */
1452
2.97k
                            ps_intra32_analyse->b1_split_flag = 1;
1453
1454
                            /* CU size 16x16 and fill the final cu params for all 4 blocks */
1455
14.8k
                            for(j = 0; j < 4; j++)
1456
11.9k
                            {
1457
11.9k
                                WORD32 i4_q_scale_q3_mod;
1458
11.9k
                                UWORD8 u1_cu_possible_qp;
1459
11.9k
                                WORD32 i4_act_factor;
1460
1461
                                /* Set CU split flag */
1462
11.9k
                                ASSERT(blk_cnt % 4 == 0);
1463
1464
11.9k
                                ihevce_update_cand_list(
1465
11.9k
                                    ps_cu_node->ps_sub_cu[j], ps_ed_blk_l1, ps_ctxt);
1466
1467
                                /* store the 16x16 non split flag  */
1468
11.9k
                                ps_intra16_analyse[j].b1_split_flag = 0;
1469
1470
11.9k
                                ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
1471
11.9k
                                ASSERT(ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][0] != -2);
1472
                                /*As 16*16 has won, pick L1 8x8 qp which maps
1473
                                to L0 16x16 Qp*/
1474
11.9k
                                u1_cu_possible_qp = ihevce_cu_level_qp_mod(
1475
11.9k
                                    ps_ctxt->i4_qscale,
1476
11.9k
                                    ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][0],
1477
11.9k
                                    ps_ctxt->ld_curr_frame_8x8_log_avg[0],
1478
11.9k
                                    f_strength,
1479
11.9k
                                    &i4_act_factor,
1480
11.9k
                                    &i4_q_scale_q3_mod,
1481
11.9k
                                    ps_ctxt->ps_rc_quant_ctxt);
1482
1483
                                /*accum satd/qp for all child block*/
1484
11.9k
                                i8_frame_acc_satd_by_modqp_q10 +=
1485
11.9k
                                    ((LWORD64)child_satd[j]
1486
11.9k
                                     << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
1487
11.9k
                                    i4_q_scale_q3_mod;
1488
1489
                                /* Accumulate mode bits for all child blocks */
1490
11.9k
                                i8_frame_acc_mode_bits_cost +=
1491
11.9k
                                    ps_cu_node->ps_sub_cu[j]->u2_mode_bits_cost;
1492
1493
                                /* satd and mpm bits accumulation of best cu size candidate */
1494
11.9k
                                i4_ctb_acc_satd += child_satd[j];
1495
1496
                                /* Increment pointers */
1497
                                //ps_row_cu++;
1498
11.9k
                                ps_ed_blk_l1 += 4;
1499
11.9k
                                blk_cnt += 4;
1500
11.9k
                            }
1501
1502
                            /* cost accumulation of best cu size candidate */
1503
2.97k
                            i8_frame_acc_satd_cost += child_cost_least;
1504
1505
                            /* 64x64 merge is not possible */
1506
2.97k
                            merge_64x64 = 0;
1507
2.97k
                        }
1508
1509
                        //ps_ed_blk_l2 += 4;
1510
1511
19.5k
                    }  //end of EIID's else
1512
20.2k
#endif
1513
20.2k
                }
1514
                /* If Merge success for L1 max CU size 16x16 is chosen */
1515
97.3k
                else if(merge_16x16_l1)
1516
41.8k
                {
1517
#if IP_DBG_L1_l2
1518
                    ps_cu_node->ps_parent->u1_cu_size = 16;
1519
                    ps_cu_node->ps_parent->u2_x0 = gau1_cu_pos_x[blk_cnt]; /* Populate properly */
1520
                    ps_cu_node->ps_parent->u2_y0 = gau1_cu_pos_y[blk_cnt]; /* Populate properly */
1521
                    ps_cu_node->ps_parent->best_mode = ps_ed_blk_l1->best_merge_mode;
1522
                    ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
1523
1524
                    blk_cnt += 4;
1525
                    ps_ed_blk_l1 += 4;
1526
                    ps_row_cu++;
1527
                    merge_64x64 = 0;
1528
#else
1529
1530
                    /*EIID: evaluate only if L1 early-inter-intra decision is not favouring inter*/
1531
                    /* enable this only in B pictures */
1532
41.8k
                    if(ps_ed_blk_l1->intra_or_inter == 2 && (ps_ctxt->i4_slice_type != ISLICE))
1533
2.28k
                    {
1534
2.28k
                        WORD32 i4_q_scale_q3_mod, i4_local_ctr;
1535
2.28k
                        WORD8 i1_cu_possible_qp;
1536
2.28k
                        WORD32 i4_act_factor;
1537
                        /* make cost infinity. */
1538
                        /* make modes invalid */
1539
                        /* update loop variables */
1540
                        /* set other output variables */
1541
                        /* dont set neighbour flag so that next blocks wont access this cu */
1542
                        /* what happens to ctb_mode_map?? */
1543
1544
2.28k
                        ps_cu_node->ps_parent->u1_cu_size = 16;
1545
2.28k
                        ps_cu_node->ps_parent->u2_x0 =
1546
2.28k
                            gau1_cu_pos_x[blk_cnt]; /* Populate properly */
1547
2.28k
                        ps_cu_node->ps_parent->u2_y0 =
1548
2.28k
                            gau1_cu_pos_y[blk_cnt]; /* Populate properly */
1549
2.28k
                        ps_cu_node->ps_parent->best_mode =
1550
2.28k
                            INTRA_DC;  //ps_ed_blk_l1->best_merge_mode;
1551
1552
                        /* fill in the first modes as invalid */
1553
1554
2.28k
                        ps_cu_node->ps_parent->au1_best_mode_1tu[0] = INTRA_DC;
1555
2.28k
                        ps_cu_node->ps_parent->au1_best_mode_1tu[1] =
1556
2.28k
                            INTRA_DC;  //for safery. Since update_cand_list will set num_modes as 3
1557
2.28k
                        ps_cu_node->ps_parent->au1_best_mode_1tu[2] = INTRA_DC;
1558
1559
2.28k
                        ps_cu_node->ps_parent->au1_best_mode_4tu[0] = INTRA_DC;
1560
2.28k
                        ps_cu_node->ps_parent->au1_best_mode_4tu[1] = INTRA_DC;
1561
2.28k
                        ps_cu_node->ps_parent->au1_best_mode_4tu[2] = INTRA_DC;
1562
1563
2.28k
                        ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
1564
1565
                        //ps_row_cu->s_cu_intra_cand.b6_num_intra_cands = 0;
1566
                        //ps_row_cu->u1_num_intra_rdopt_cands = 0;
1567
1568
2.28k
                        ps_intra32_analyse->b1_split_flag = 1;
1569
2.28k
                        ps_intra32_analyse->b1_merge_flag = 0;
1570
1571
2.28k
                        ps_intra16_analyse->b1_valid_cu = 0;
1572
2.28k
                        ps_intra16_analyse->b1_split_flag = 0;
1573
2.28k
                        ps_intra16_analyse->b1_merge_flag = 1;
1574
                        //memset (&ps_intra16_analyse->au1_best_modes_16x16_tu,
1575
                        //  255,
1576
                        //  NUM_BEST_MODES);
1577
                        //memset (&ps_intra16_analyse->au1_best_modes_8x8_tu,
1578
                        //  255,
1579
                        //  NUM_BEST_MODES);
1580
                        //set only first mode since if it's 255. it wont go ahead
1581
2.28k
                        ps_intra16_analyse->au1_best_modes_16x16_tu[0] = 255;
1582
2.28k
                        ps_intra16_analyse->au1_best_modes_8x8_tu[0] = 255;
1583
2.28k
                        *pi4_intra_16_cost = MAX_INTRA_COST_IPE;
1584
1585
                        /*since ME will start evaluating from bottom up, set the lower
1586
                        cu size data invalid */
1587
11.4k
                        for(i4_local_ctr = 0; i4_local_ctr < 4; i4_local_ctr++)
1588
9.12k
                        {
1589
9.12k
                            ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1590
9.12k
                                .au1_4x4_best_modes[0][0] = 255;
1591
9.12k
                            ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1592
9.12k
                                .au1_4x4_best_modes[1][0] = 255;
1593
9.12k
                            ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1594
9.12k
                                .au1_4x4_best_modes[2][0] = 255;
1595
9.12k
                            ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1596
9.12k
                                .au1_4x4_best_modes[3][0] = 255;
1597
9.12k
                            ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1598
9.12k
                                .au1_best_modes_8x8_tu[0] = 255;
1599
9.12k
                            ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
1600
9.12k
                                .au1_best_modes_4x4_tu[0] = 255;
1601
1602
9.12k
                            pi4_intra_8_cost
1603
9.12k
                                [(i4_local_ctr & 1) + (MAX_CU_IN_CTB_ROW * (i4_local_ctr >> 1))] =
1604
9.12k
                                    MAX_INTRA_COST_IPE;
1605
9.12k
                        }
1606
1607
                        /* set neighbours even if intra is not evaluated, since source is always available. */
1608
2.28k
                        ihevce_set_nbr_map(
1609
2.28k
                            ps_ctxt->pu1_ctb_nbr_map,
1610
2.28k
                            ps_ctxt->i4_nbr_map_strd,
1611
2.28k
                            ps_cu_node->ps_parent->u2_x0 << 1,
1612
2.28k
                            ps_cu_node->ps_parent->u2_y0 << 1,
1613
2.28k
                            (ps_cu_node->ps_parent->u1_cu_size >> 2),
1614
2.28k
                            1);
1615
1616
                        //what happens to RC variables??
1617
                        /* run only constant Qp */
1618
2.28k
                        ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
1619
2.28k
                        ASSERT(ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][0] != -2);
1620
2.28k
                        i1_cu_possible_qp = ihevce_cu_level_qp_mod(
1621
2.28k
                            ps_ctxt->i4_qscale,
1622
2.28k
                            ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][0],
1623
2.28k
                            ps_ctxt->ld_curr_frame_8x8_log_avg[0],
1624
2.28k
                            f_strength,
1625
2.28k
                            &i4_act_factor,
1626
2.28k
                            &i4_q_scale_q3_mod,
1627
2.28k
                            ps_ctxt->ps_rc_quant_ctxt);
1628
1629
                        /* cost accumulation of best cu size candidate */
1630
2.28k
                        i8_frame_acc_satd_cost += 0;  //parent_cost;  //incorrect accumulation
1631
1632
                        /*satd/mod_qp accumulation of best cu */
1633
2.28k
                        i8_frame_acc_satd_by_modqp_q10 += 0;  //incorrect accumulation
1634
                        //((LWORD64)ps_cu_node->ps_parent->best_satd << SATD_BY_ACT_Q_FAC)/i4_q_scale_q3_mod;
1635
1636
                        /* Accumulate mode bits for all child blocks */
1637
2.28k
                        i8_frame_acc_mode_bits_cost +=
1638
2.28k
                            0;  //ps_cu_node->ps_parent->u2_mode_bits_cost;
1639
                        //incorrect accumulation
1640
1641
2.28k
                        blk_cnt += 4;
1642
2.28k
                        ps_ed_blk_l1 += 4;
1643
                        //ps_row_cu++;
1644
2.28k
                        merge_64x64 = 0;
1645
1646
                        /* increment for stat purpose only. Increment is valid only on single thread */
1647
2.28k
                        ps_ctxt->u4_num_16x16_skips_at_L0_IPE += 1;
1648
2.28k
                    }
1649
39.5k
                    else
1650
39.5k
                    {
1651
                        /* 64x64 merge is not possible */
1652
39.5k
                        merge_64x64 = 0;
1653
1654
                        /* set the 32x32 split flag to 1 */
1655
39.5k
                        ps_intra32_analyse->b1_split_flag = 1;
1656
1657
39.5k
                        ps_intra32_analyse->b1_merge_flag = 0;
1658
1659
39.5k
                        ps_intra16_analyse->b1_merge_flag = 1;
1660
1661
39.5k
                        if((ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P6) &&
1662
2.18k
                           (ps_ctxt->i4_slice_type == PSLICE))
1663
2.08k
                        {
1664
2.08k
                            ps_ctxt->u1_disable_child_cu_decide = 1;
1665
2.08k
                            step2_bypass = 0;
1666
2.08k
                        }
1667
                        //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
1668
                        /* Based on the flag, Child modes decision can be disabled*/
1669
39.5k
                        if(0 == ps_ctxt->u1_disable_child_cu_decide)
1670
37.4k
                        {
1671
187k
                            for(j = 0; j < 4; j++)
1672
149k
                            {
1673
149k
                                intra8_analyse_t *ps_intra8_analyse;
1674
149k
                                WORD32 best_ang_mode = (ps_ed_blk_l1 + j)->best_mode;
1675
1676
149k
                                if(best_ang_mode < 2)
1677
90.0k
                                    best_ang_mode = 26;
1678
1679
                                //ps_cu_node->ps_sub_cu[j]->best_cost = MAX_INTRA_COST_IPE;
1680
                                //ps_cu_node->ps_sub_cu[j]->best_mode = (ps_ed_blk_l1 + j)->best_mode;
1681
1682
149k
                                ps_cu_node->ps_sub_cu[j]->u2_x0 =
1683
149k
                                    gau1_cu_pos_x[blk_cnt + j]; /* Populate properly */
1684
149k
                                ps_cu_node->ps_sub_cu[j]->u2_y0 =
1685
149k
                                    gau1_cu_pos_y[blk_cnt + j]; /* Populate properly */
1686
149k
                                ps_cu_node->ps_sub_cu[j]->u1_cu_size = 8;
1687
1688
149k
                                ihevce_mode_eval_filtering(
1689
149k
                                    ps_cu_node->ps_sub_cu[j],
1690
149k
                                    ps_cu_node,
1691
149k
                                    ps_ctxt,
1692
149k
                                    ps_curr_src,
1693
149k
                                    best_ang_mode,
1694
149k
                                    &ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0],
1695
149k
                                    &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1696
149k
                                    !step2_bypass,
1697
149k
                                    1);
1698
1699
149k
                                if(i4_enable_4cu_16tu)
1700
54.7k
                                {
1701
54.7k
                                    ihevce_mode_eval_filtering(
1702
54.7k
                                        ps_cu_node->ps_sub_cu[j],
1703
54.7k
                                        ps_cu_node,
1704
54.7k
                                        ps_ctxt,
1705
54.7k
                                        ps_curr_src,
1706
54.7k
                                        best_ang_mode,
1707
54.7k
                                        &ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1708
54.7k
                                        &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1709
54.7k
                                        !step2_bypass,
1710
54.7k
                                        0);
1711
54.7k
                                }
1712
95.2k
                                else
1713
95.2k
                                {
1714
                                    /* 4TU not evaluated :  4tu modes set same as 1tu modes */
1715
95.2k
                                    memcpy(
1716
95.2k
                                        &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1717
95.2k
                                        &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1718
95.2k
                                        NUM_BEST_MODES);
1719
1720
                                    /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
1721
95.2k
                                    memcpy(
1722
95.2k
                                        &ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1723
95.2k
                                        &ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0],
1724
95.2k
                                        NUM_BEST_MODES * sizeof(WORD32));
1725
95.2k
                                }
1726
1727
149k
                                child_cost[j] =
1728
149k
                                    MIN(ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
1729
149k
                                        ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0]);
1730
1731
149k
                                child_cost_least += child_cost[j];
1732
1733
                                /* Select the best mode to be populated as top and left nbr depending on the
1734
                                4tu and 1tu cost */
1735
149k
                                if(ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0] >
1736
149k
                                   ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0])
1737
13.5k
                                {
1738
13.5k
                                    ps_cu_node->ps_sub_cu[j]->best_mode =
1739
13.5k
                                        ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0];
1740
13.5k
                                }
1741
136k
                                else
1742
136k
                                {
1743
136k
                                    ps_cu_node->ps_sub_cu[j]->best_mode =
1744
136k
                                        ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0];
1745
136k
                                }
1746
149k
                                { /* Update the CTB nodes only for MAX - 1 CU nodes */
1747
149k
                                    WORD32 xA, yA, row, col;
1748
149k
                                    xA = ((ps_cu_node->ps_sub_cu[j]->u2_x0 << 3) >> 2) + 1;
1749
149k
                                    yA = ((ps_cu_node->ps_sub_cu[j]->u2_y0 << 3) >> 2) + 1;
1750
149k
                                    size = ps_cu_node->ps_sub_cu[j]->u1_cu_size >> 2;
1751
449k
                                    for(row = yA; row < (yA + size); row++)
1752
299k
                                    {
1753
899k
                                        for(col = xA; col < (xA + size); col++)
1754
599k
                                        {
1755
599k
                                            ps_ctxt->au1_ctb_mode_map[row][col] =
1756
599k
                                                ps_cu_node->ps_sub_cu[j]->best_mode;
1757
599k
                                        }
1758
299k
                                    }
1759
149k
                                }
1760
1761
                                /*collect individual child satd for final SATD/qp accum*/
1762
149k
                                child_satd[j] = ps_cu_node->ps_sub_cu[j]->best_satd;
1763
1764
149k
                                ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[j];
1765
1766
                                /* store the child 8x8 costs */
1767
149k
                                pi4_intra_8_cost[(j & 1) + (MAX_CU_IN_CTB_ROW * (j >> 1))] =
1768
149k
                                    child_cost[j];
1769
1770
                                /* set the CU valid flag */
1771
149k
                                ps_intra8_analyse->b1_valid_cu = 1;
1772
149k
                                ps_intra8_analyse->b1_enable_nxn = 0;
1773
1774
                                /* storing the modes to intra8  analyse */
1775
1776
                                /* store the best 8x8 modes 8x8 tu */
1777
149k
                                memcpy(
1778
149k
                                    &ps_intra8_analyse->au1_best_modes_8x8_tu[0],
1779
149k
                                    &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
1780
149k
                                    sizeof(UWORD8) * (NUM_BEST_MODES));
1781
149k
                                ps_intra8_analyse->au1_best_modes_8x8_tu[NUM_BEST_MODES] = 255;
1782
1783
                                /* store the best 8x8 modes 4x4 tu */
1784
149k
                                memcpy(
1785
149k
                                    &ps_intra8_analyse->au1_best_modes_4x4_tu[0],
1786
149k
                                    &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
1787
149k
                                    sizeof(UWORD8) * (NUM_BEST_MODES));
1788
149k
                                ps_intra8_analyse->au1_best_modes_4x4_tu[NUM_BEST_MODES] = 255;
1789
1790
                                /* NXN modes not evaluated hence set to 255 */
1791
149k
                                memset(
1792
149k
                                    &ps_intra8_analyse->au1_4x4_best_modes[0][0],
1793
149k
                                    255,
1794
149k
                                    sizeof(UWORD8) * 4 * (NUM_BEST_MODES + 1));
1795
149k
                            }
1796
1797
37.4k
                            ihevce_set_nbr_map(
1798
37.4k
                                ps_ctxt->pu1_ctb_nbr_map,
1799
37.4k
                                ps_ctxt->i4_nbr_map_strd,
1800
37.4k
                                ps_cu_node->ps_sub_cu[0]->u2_x0 << 1,
1801
37.4k
                                ps_cu_node->ps_sub_cu[0]->u2_y0 << 1,
1802
37.4k
                                (ps_cu_node->ps_sub_cu[0]->u1_cu_size >> 1),
1803
37.4k
                                0);
1804
37.4k
                        }
1805
2.08k
#if 1  //DISBLE_CHILD_CU_EVAL_L0_IPE //1
1806
2.08k
                        else
1807
2.08k
                        {
1808
10.4k
                            for(j = 0; j < 4; j++)
1809
8.35k
                            {
1810
8.35k
                                intra8_analyse_t *ps_intra8_analyse;
1811
8.35k
                                ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[j];
1812
8.35k
                                ps_intra8_analyse->au1_best_modes_8x8_tu[0] = 255;
1813
8.35k
                                ps_intra8_analyse->au1_best_modes_4x4_tu[0] = 255;
1814
                                /* NXN modes not evaluated hence set to 255 */
1815
8.35k
                                memset(
1816
8.35k
                                    &ps_intra8_analyse->au1_4x4_best_modes[0][0],
1817
8.35k
                                    255,
1818
8.35k
                                    sizeof(UWORD8) * 4 * (NUM_BEST_MODES + 1));
1819
1820
8.35k
                                ps_intra8_analyse->b1_valid_cu = 0;
1821
8.35k
                                ps_intra8_analyse->b1_enable_nxn = 0;
1822
8.35k
                            }
1823
2.08k
                            child_cost_least = MAX_INTRA_COST_IPE;
1824
2.08k
                        }
1825
39.5k
#endif
1826
                        //ps_cu_node->ps_parent->best_mode = ps_ed_blk_l1->best_mode;
1827
                        //ps_cu_node->ps_parent->best_cost = MAX_INTRA_COST_IPE;
1828
1829
39.5k
                        ps_cu_node->ps_parent->u1_cu_size = 16;
1830
39.5k
                        ps_cu_node->ps_parent->u2_x0 =
1831
39.5k
                            gau1_cu_pos_x[blk_cnt]; /* Populate properly */
1832
39.5k
                        ps_cu_node->ps_parent->u2_y0 =
1833
39.5k
                            gau1_cu_pos_y[blk_cnt]; /* Populate properly */
1834
1835
                        //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
1836
1837
                        /* Eval for TUSize = CuSize */
1838
39.5k
                        ihevce_mode_eval_filtering(
1839
39.5k
                            ps_cu_node->ps_parent,
1840
39.5k
                            ps_cu_node,
1841
39.5k
                            ps_ctxt,
1842
39.5k
                            ps_curr_src,
1843
39.5k
                            26,
1844
39.5k
                            &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
1845
39.5k
                            &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1846
39.5k
                            step2_bypass,
1847
39.5k
                            1);
1848
1849
39.5k
                        if(i4_enable_1cu_4tu)
1850
13.6k
                        {
1851
                            /* Eval for TUSize = CuSize/2 */
1852
13.6k
                            ihevce_mode_eval_filtering(
1853
13.6k
                                ps_cu_node->ps_parent,
1854
13.6k
                                ps_cu_node,
1855
13.6k
                                ps_ctxt,
1856
13.6k
                                ps_curr_src,
1857
13.6k
                                26,
1858
13.6k
                                &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1859
13.6k
                                &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1860
13.6k
                                step2_bypass,
1861
13.6k
                                0);
1862
13.6k
                        }
1863
25.9k
                        else
1864
25.9k
                        {
1865
                            /* 4TU not evaluated :  4tu modes set same as 1tu modes */
1866
25.9k
                            memcpy(
1867
25.9k
                                &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1868
25.9k
                                &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1869
25.9k
                                NUM_BEST_MODES);
1870
1871
                            /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
1872
25.9k
                            memcpy(
1873
25.9k
                                &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1874
25.9k
                                &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
1875
25.9k
                                NUM_BEST_MODES * sizeof(WORD32));
1876
25.9k
                        }
1877
1878
39.5k
                        ps_ctxt->u1_disable_child_cu_decide = 0;
1879
39.5k
                        step2_bypass = 1;
1880
1881
                        /* Update parent cost */
1882
39.5k
                        parent_cost =
1883
39.5k
                            MIN(ps_cu_node->ps_parent->au4_best_cost_4tu[0],
1884
39.5k
                                ps_cu_node->ps_parent->au4_best_cost_1tu[0]);
1885
1886
                        /* Select the best mode to be populated as top and left nbr depending on the
1887
                        4tu and 1tu cost */
1888
39.5k
                        if(ps_cu_node->ps_parent->au4_best_cost_4tu[0] >
1889
39.5k
                           ps_cu_node->ps_parent->au4_best_cost_1tu[0])
1890
3.80k
                        {
1891
3.80k
                            ps_cu_node->ps_parent->best_mode =
1892
3.80k
                                ps_cu_node->ps_parent->au1_best_mode_1tu[0];
1893
3.80k
                        }
1894
35.7k
                        else
1895
35.7k
                        {
1896
35.7k
                            ps_cu_node->ps_parent->best_mode =
1897
35.7k
                                ps_cu_node->ps_parent->au1_best_mode_4tu[0];
1898
35.7k
                        }
1899
1900
                        /* store the 16x16 cost */
1901
39.5k
                        *pi4_intra_16_cost = parent_cost;
1902
1903
                        /* accumulate the 32x32 cost */
1904
39.5k
                        if(MAX_INTRA_COST_IPE == *pi4_intra_32_cost)
1905
10.2k
                        {
1906
10.2k
                            *pi4_intra_32_cost = parent_cost;
1907
10.2k
                        }
1908
29.3k
                        else
1909
29.3k
                        {
1910
29.3k
                            *pi4_intra_32_cost += parent_cost;
1911
29.3k
                        }
1912
1913
                        /* set the CU valid flag */
1914
39.5k
                        ps_intra16_analyse->b1_valid_cu = 1;
1915
1916
                        /* storing the modes to intra 16 analyse */
1917
39.5k
                        {
1918
                            /* store the best 16x16 modes 16x16 tu */
1919
39.5k
                            memcpy(
1920
39.5k
                                &ps_intra16_analyse->au1_best_modes_16x16_tu[0],
1921
39.5k
                                &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
1922
39.5k
                                sizeof(UWORD8) * NUM_BEST_MODES);
1923
39.5k
                            ps_intra16_analyse->au1_best_modes_16x16_tu[NUM_BEST_MODES] = 255;
1924
1925
                            /* store the best 16x16 modes 8x8 tu */
1926
39.5k
                            memcpy(
1927
39.5k
                                &ps_intra16_analyse->au1_best_modes_8x8_tu[0],
1928
39.5k
                                &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
1929
39.5k
                                sizeof(UWORD8) * NUM_BEST_MODES);
1930
39.5k
                            ps_intra16_analyse->au1_best_modes_8x8_tu[NUM_BEST_MODES] = 255;
1931
39.5k
                        }
1932
1933
39.5k
                        parent_best_mode = ps_cu_node->ps_parent->best_mode;
1934
39.5k
                        if(parent_cost <=
1935
39.5k
                           child_cost_least + (ps_ctxt->i4_ol_satd_lambda * CHILD_BIAS >>
1936
39.5k
                                               LAMBDA_Q_SHIFT))  //|| identical_modes)
1937
26.8k
                        {
1938
26.8k
                            WORD32 i4_q_scale_q3_mod;
1939
26.8k
                            WORD8 i1_cu_possible_qp;
1940
26.8k
                            WORD32 i4_act_factor;
1941
                            //choose parent CU
1942
1943
26.8k
                            ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
1944
1945
                            /* set the 16x16 non split flag */
1946
26.8k
                            ps_intra16_analyse->b1_split_flag = 0;
1947
1948
                            /*As 16*16 has won, pick L1 8x8 qp which maps
1949
                            to L0 16x16 Qp*/
1950
26.8k
                            ASSERT(((blk_cnt >> 4) & 3) == (blk_cnt >> 4));
1951
26.8k
                            ASSERT(ps_ed_ctb_l1->i4_16x16_satd[blk_cnt >> 4][0] != -2);
1952
26.8k
                            i1_cu_possible_qp = ihevce_cu_level_qp_mod(
1953
26.8k
                                ps_ctxt->i4_qscale,
1954
26.8k
                                ps_ed_ctb_l1->i4_16x16_satd[blk_cnt >> 4][0],
1955
26.8k
                                ps_ctxt->ld_curr_frame_8x8_log_avg[0],
1956
26.8k
                                f_strength,
1957
26.8k
                                &i4_act_factor,
1958
26.8k
                                &i4_q_scale_q3_mod,
1959
26.8k
                                ps_ctxt->ps_rc_quant_ctxt);
1960
1961
                            /* cost accumulation of best cu size candidate */
1962
26.8k
                            i8_frame_acc_satd_cost += parent_cost;
1963
1964
                            /* satd and mpm bits accumulation of best cu size candidate */
1965
26.8k
                            i4_ctb_acc_satd += ps_cu_node->ps_parent->best_satd;
1966
1967
                            /*satd/mod_qp accumulation of best cu */
1968
26.8k
                            i8_frame_acc_satd_by_modqp_q10 +=
1969
26.8k
                                ((LWORD64)ps_cu_node->ps_parent->best_satd
1970
26.8k
                                 << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
1971
26.8k
                                i4_q_scale_q3_mod;
1972
1973
                            /* Accumulate mode bits of the parent (16x16) CU */
1974
26.8k
                            i8_frame_acc_mode_bits_cost += ps_cu_node->ps_parent->u2_mode_bits_cost;
1975
1976
26.8k
                            blk_cnt += 4;
1977
26.8k
                            ps_ed_blk_l1 += 4;
1978
                            //ps_row_cu++;
1979
26.8k
                        }
1980
12.7k
                        else
1981
12.7k
                        {
1982
                            //choose child CU
1983
12.7k
                            WORD8 i1_cu_possible_qp;
1984
12.7k
                            WORD32 i4_act_factor;
1985
12.7k
                            WORD32 i4_q_scale_q3_mod;
1986
1987
12.7k
                            ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
1988
12.7k
                            ASSERT(ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][1] != -2);
1989
12.7k
                            i1_cu_possible_qp = ihevce_cu_level_qp_mod(
1990
12.7k
                                ps_ctxt->i4_qscale,
1991
12.7k
                                ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][1],
1992
12.7k
                                ps_ctxt->ld_curr_frame_8x8_log_avg[1],
1993
12.7k
                                f_strength,
1994
12.7k
                                &i4_act_factor,
1995
12.7k
                                &i4_q_scale_q3_mod,
1996
12.7k
                                ps_ctxt->ps_rc_quant_ctxt);
1997
1998
                            /* set the 16x16 split flag */
1999
12.7k
                            ps_intra16_analyse->b1_split_flag = 1;
2000
2001
63.6k
                            for(j = 0; j < 4; j++)
2002
50.9k
                            {
2003
50.9k
                                ihevce_update_cand_list(
2004
50.9k
                                    ps_cu_node->ps_sub_cu[j], ps_ed_blk_l1, ps_ctxt);
2005
2006
50.9k
                                if((IHEVCE_QUALITY_P3 > i4_quality_preset))
2007
24.1k
                                {
2008
24.1k
                                    WORD32 k;
2009
24.1k
                                    intra8_analyse_t *ps_intra8_analyse;
2010
24.1k
                                    ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[j];
2011
2012
120k
                                    for(k = 0; k < 4; k++)
2013
96.4k
                                    {
2014
                                        /* Populate best 3 nxn modes */
2015
96.4k
                                        ps_intra8_analyse->au1_4x4_best_modes[k][0] =
2016
96.4k
                                            ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0];
2017
96.4k
                                        ps_intra8_analyse->au1_4x4_best_modes[k][1] =
2018
96.4k
                                            ps_cu_node->ps_sub_cu[j]
2019
96.4k
                                                ->au1_best_mode_4tu[1];  //(ps_ed + 1)->best_mode;
2020
96.4k
                                        ps_intra8_analyse->au1_4x4_best_modes[k][2] =
2021
96.4k
                                            ps_cu_node->ps_sub_cu[j]
2022
96.4k
                                                ->au1_best_mode_4tu[2];  //(ps_ed + 2)->best_mode;
2023
96.4k
                                        ps_intra8_analyse->au1_4x4_best_modes[k][3] = 255;
2024
96.4k
                                    }
2025
24.1k
                                }
2026
                                /* accumulate satd/mod_qp for all child blocks */
2027
50.9k
                                i8_frame_acc_satd_by_modqp_q10 +=
2028
50.9k
                                    ((LWORD64)child_satd[j]
2029
50.9k
                                     << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
2030
50.9k
                                    i4_q_scale_q3_mod;
2031
2032
                                /* Accumulate mode bits for all child blocks */
2033
50.9k
                                i8_frame_acc_mode_bits_cost +=
2034
50.9k
                                    ps_cu_node->ps_sub_cu[j]->u2_mode_bits_cost;
2035
2036
                                /* satd and mpm bits accumulation of best cu size candidate */
2037
50.9k
                                i4_ctb_acc_satd += child_satd[j];
2038
2039
50.9k
                                blk_cnt += 1;
2040
50.9k
                                ps_ed_blk_l1 += 1;
2041
                                //ps_row_cu++;
2042
50.9k
                            }
2043
2044
                            /* cost accumulation of best cu size candidate */
2045
12.7k
                            i8_frame_acc_satd_cost += child_cost_least;
2046
12.7k
                        }
2047
2048
39.5k
                    }  //else of EIID
2049
41.8k
#endif
2050
41.8k
                }  // if(merge_16x16_l1)
2051
                /* MAX CU SIZE 8x8 */
2052
55.4k
                else
2053
55.4k
                {
2054
#if IP_DBG_L1_l2
2055
                    for(i = 0; i < 4; i++)
2056
                    {
2057
                        ps_cu_node->ps_parent->u1_cu_size = 8;
2058
                        ps_cu_node->ps_parent->u2_x0 =
2059
                            gau1_cu_pos_x[blk_cnt]; /* Populate properly */
2060
                        ps_cu_node->ps_parent->u2_y0 =
2061
                            gau1_cu_pos_y[blk_cnt]; /* Populate properly */
2062
                        ps_cu_node->ps_parent->best_mode = ps_ed_blk_l1->best_mode;
2063
2064
                        ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
2065
                        blk_cnt++;
2066
                        ps_ed_blk_l1++;
2067
                        ps_row_cu++;
2068
                        merge_64x64 = 0;
2069
                    }
2070
#else
2071
2072
                    /* EIID: Skip all four 8x8 blocks if the L1 decision says skip intra */
2073
55.4k
                    if(ps_ed_blk_l1->intra_or_inter == 2 && (ps_ctxt->i4_slice_type != ISLICE))
2074
3.24k
                    {
2075
3.24k
                        WORD32 i4_q_scale_q3_mod;
2076
3.24k
                        WORD8 i1_cu_possible_qp;
2077
3.24k
                        WORD32 i4_act_factor;
2078
2079
3.24k
                        merge_64x64 = 0;
2080
2081
3.24k
                        ps_intra32_analyse->b1_merge_flag = 0;
2082
2083
3.24k
                        ps_intra16_analyse->au1_best_modes_8x8_tu[0] = 255;
2084
3.24k
                        ps_intra16_analyse->au1_best_modes_8x8_tu[1] = 255;
2085
3.24k
                        ps_intra16_analyse->au1_best_modes_8x8_tu[2] = 255;
2086
2087
3.24k
                        ps_intra16_analyse->au1_best_modes_16x16_tu[0] = 255;
2088
3.24k
                        ps_intra16_analyse->au1_best_modes_16x16_tu[1] = 255;
2089
3.24k
                        ps_intra16_analyse->au1_best_modes_16x16_tu[2] = 255;
2090
3.24k
                        ps_intra16_analyse->b1_split_flag = 1;
2091
3.24k
                        ps_intra16_analyse->b1_valid_cu = 0;
2092
3.24k
                        ps_intra16_analyse->b1_merge_flag = 0;
2093
2094
16.2k
                        for(i = 0; i < 4; i++)
2095
12.9k
                        {
2096
12.9k
                            intra8_analyse_t *ps_intra8_analyse;
2097
12.9k
                            WORD32 ctr_sub_cu;
2098
2099
12.9k
                            cu_pos_x = gau1_cu_pos_x[blk_cnt];
2100
12.9k
                            cu_pos_y = gau1_cu_pos_y[blk_cnt];
2101
2102
12.9k
                            if((cu_pos_x < num_8x8_blks_x) && (cu_pos_y < num_8x8_blks_y))
2103
12.4k
                            {
2104
12.4k
                                ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[i];
2105
2106
12.4k
                                ps_intra8_analyse->b1_valid_cu = 0;
2107
12.4k
                                ps_intra8_analyse->b1_enable_nxn = 0;
2108
12.4k
                                ps_intra8_analyse->au1_4x4_best_modes[0][0] = 255;
2109
12.4k
                                ps_intra8_analyse->au1_4x4_best_modes[1][0] = 255;
2110
12.4k
                                ps_intra8_analyse->au1_4x4_best_modes[2][0] = 255;
2111
12.4k
                                ps_intra8_analyse->au1_4x4_best_modes[3][0] = 255;
2112
12.4k
                                ps_intra8_analyse->au1_best_modes_4x4_tu[0] = 255;
2113
12.4k
                                ps_intra8_analyse->au1_best_modes_8x8_tu[0] = 255;
2114
2115
12.4k
                                ps_cu_node->ps_parent->u1_cu_size = 8;
2116
12.4k
                                ps_cu_node->ps_parent->u2_x0 =
2117
12.4k
                                    gau1_cu_pos_x[blk_cnt]; /* Populate properly */
2118
12.4k
                                ps_cu_node->ps_parent->u2_y0 =
2119
12.4k
                                    gau1_cu_pos_y[blk_cnt]; /* Populate properly */
2120
12.4k
                                ps_cu_node->ps_parent->best_mode =
2121
12.4k
                                    INTRA_DC;  //ps_ed_blk_l1->best_mode;
2122
2123
                                /* fill in the first modes with INTRA_DC as placeholders (intra is not evaluated here) */
2124
2125
12.4k
                                ps_cu_node->ps_parent->au1_best_mode_1tu[0] = INTRA_DC;
2126
12.4k
                                ps_cu_node->ps_parent->au1_best_mode_1tu[1] =
2127
12.4k
                                    INTRA_DC;  //for safery. Since update_cand_list will set num_modes as 3
2128
12.4k
                                ps_cu_node->ps_parent->au1_best_mode_1tu[2] = INTRA_DC;
2129
2130
12.4k
                                ps_cu_node->ps_parent->au1_best_mode_4tu[0] = INTRA_DC;
2131
12.4k
                                ps_cu_node->ps_parent->au1_best_mode_4tu[1] = INTRA_DC;
2132
12.4k
                                ps_cu_node->ps_parent->au1_best_mode_4tu[2] = INTRA_DC;
2133
2134
12.4k
                                ihevce_update_cand_list(
2135
12.4k
                                    ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
2136
2137
                                //ps_row_cu->s_cu_intra_cand.b6_num_intra_cands = 0;
2138
                                //ps_row_cu->u1_num_intra_rdopt_cands = 0;
2139
2140
62.0k
                                for(ctr_sub_cu = 0; ctr_sub_cu < 4; ctr_sub_cu++)
2141
49.6k
                                {
2142
49.6k
                                    ps_cu_node->ps_sub_cu[ctr_sub_cu]->au1_best_mode_1tu[0] =
2143
49.6k
                                        INTRA_DC;
2144
49.6k
                                    ps_cu_node->ps_sub_cu[ctr_sub_cu]->au1_best_mode_4tu[0] =
2145
49.6k
                                        INTRA_DC;
2146
49.6k
                                    ps_cu_node->ps_sub_cu[ctr_sub_cu]->au4_best_cost_1tu[0] =
2147
49.6k
                                        MAX_INTRA_COST_IPE;
2148
2149
49.6k
                                    ps_cu_node->ps_sub_cu[ctr_sub_cu]->au4_best_cost_4tu[0] =
2150
49.6k
                                        MAX_INTRA_COST_IPE;
2151
49.6k
                                    ps_cu_node->ps_sub_cu[ctr_sub_cu]->best_cost =
2152
49.6k
                                        MAX_INTRA_COST_IPE;
2153
49.6k
                                }
2154
2155
12.4k
                                pi4_intra_8_cost[(i & 1) + (MAX_CU_IN_CTB_ROW * (i >> 1))] =
2156
12.4k
                                    MAX_INTRA_COST_IPE;
2157
2158
12.4k
                                ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
2159
12.4k
                                ASSERT(ps_ed_ctb_l1->i4_8x8_satd[(blk_cnt >> 2)][1] != -2);
2160
12.4k
                                i1_cu_possible_qp = ihevce_cu_level_qp_mod(
2161
12.4k
                                    ps_ctxt->i4_qscale,
2162
12.4k
                                    ps_ed_ctb_l1->i4_8x8_satd[(blk_cnt >> 2)][1],
2163
12.4k
                                    ps_ctxt->ld_curr_frame_8x8_log_avg[1],
2164
12.4k
                                    f_strength,
2165
12.4k
                                    &i4_act_factor,
2166
12.4k
                                    &i4_q_scale_q3_mod,
2167
12.4k
                                    ps_ctxt->ps_rc_quant_ctxt);
2168
2169
                                /* set neighbours even if intra is not evaluated, since source is always available. */
2170
12.4k
                                ihevce_set_nbr_map(
2171
12.4k
                                    ps_ctxt->pu1_ctb_nbr_map,
2172
12.4k
                                    ps_ctxt->i4_nbr_map_strd,
2173
12.4k
                                    ps_cu_node->ps_parent->u2_x0 << 1,
2174
12.4k
                                    ps_cu_node->ps_parent->u2_y0 << 1,
2175
12.4k
                                    (ps_cu_node->ps_parent->u1_cu_size >> 2),
2176
12.4k
                                    1);
2177
2178
                                //ps_row_cu++;
2179
12.4k
                            }
2180
12.9k
                            blk_cnt++;
2181
12.9k
                            ps_ed_blk_l1++;
2182
12.9k
                        }
2183
3.24k
                    }
2184
52.2k
                    else
2185
52.2k
                    {
2186
                        //cu_intra_cand_t *ps_cu_intra_cand;
2187
52.2k
                        WORD8 i1_cu_possible_qp;
2188
52.2k
                        WORD32 i4_act_factor;
2189
52.2k
                        WORD32 i4_q_scale_q3_mod;
2190
2191
52.2k
                        ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
2192
52.2k
                        ASSERT(ps_ed_ctb_l1->i4_8x8_satd[(blk_cnt >> 2)][1] != -2);
2193
52.2k
                        i1_cu_possible_qp = ihevce_cu_level_qp_mod(
2194
52.2k
                            ps_ctxt->i4_qscale,
2195
52.2k
                            ps_ed_ctb_l1->i4_8x8_satd[(blk_cnt >> 2)][1],
2196
52.2k
                            ps_ctxt->ld_curr_frame_8x8_log_avg[1],
2197
52.2k
                            f_strength,
2198
52.2k
                            &i4_act_factor,
2199
52.2k
                            &i4_q_scale_q3_mod,
2200
52.2k
                            ps_ctxt->ps_rc_quant_ctxt);
2201
2202
                        /* 64x64 merge is not possible */
2203
52.2k
                        merge_64x64 = 0;
2204
2205
52.2k
                        ps_intra32_analyse->b1_merge_flag = 0;
2206
2207
52.2k
                        ps_intra16_analyse->b1_merge_flag = 0;
2208
2209
                        /* by default 16x16 modes are set to default values DC and Planar */
2210
52.2k
                        ps_intra16_analyse->au1_best_modes_8x8_tu[0] = 0;
2211
52.2k
                        ps_intra16_analyse->au1_best_modes_8x8_tu[1] = 1;
2212
52.2k
                        ps_intra16_analyse->au1_best_modes_8x8_tu[2] = 255;
2213
2214
52.2k
                        ps_intra16_analyse->au1_best_modes_16x16_tu[0] = 0;
2215
52.2k
                        ps_intra16_analyse->au1_best_modes_16x16_tu[1] = 1;
2216
52.2k
                        ps_intra16_analyse->au1_best_modes_16x16_tu[2] = 255;
2217
52.2k
                        ps_intra16_analyse->b1_split_flag = 1;
2218
52.2k
                        ps_intra16_analyse->b1_valid_cu = 1;
2219
2220
261k
                        for(i = 0; i < 4; i++)
2221
208k
                        {
2222
208k
                            intra8_analyse_t *ps_intra8_analyse;
2223
208k
                            cu_pos_x = gau1_cu_pos_x[blk_cnt];
2224
208k
                            cu_pos_y = gau1_cu_pos_y[blk_cnt];
2225
208k
                            if((cu_pos_x < num_8x8_blks_x) && (cu_pos_y < num_8x8_blks_y))
2226
187k
                            {
2227
                                //ps_cu_intra_cand = &ps_row_cu->s_cu_intra_cand;
2228
                                //ps_cu_node->ps_parent->best_cost = MAX_INTRA_COST_IPE;
2229
2230
                                //ps_cu_node->ps_parent->best_mode = ps_ed_blk_l1->best_mode;
2231
2232
187k
                                child_cost_least = 0;
2233
2234
187k
                                ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[i];
2235
187k
                                ps_cu_node->ps_parent->u1_cu_size = 8;
2236
187k
                                ps_cu_node->ps_parent->u2_x0 =
2237
187k
                                    gau1_cu_pos_x[blk_cnt]; /* Populate properly */
2238
187k
                                ps_cu_node->ps_parent->u2_y0 =
2239
187k
                                    gau1_cu_pos_y[blk_cnt]; /* Populate properly */
2240
2241
                                //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
2242
2243
                                /*EARLY DECISION 8x8 block */
2244
187k
                                ihevce_pu_calc_8x8_blk(
2245
187k
                                    ps_curr_src, ps_ctxt, ps_cu_node, ps_ctxt->ps_func_selector);
2246
936k
                                for(j = 0; j < 4; j++)
2247
749k
                                {
2248
749k
                                    child_cost_least += ps_cu_node->ps_sub_cu[j]->best_cost;
2249
749k
                                    child_satd[j] = ps_cu_node->ps_sub_cu[j]->best_satd;
2250
749k
                                }
2251
2252
                                /* Based on the flag, CU = 4TU modes decision can be disabled, CU = 4PU is retained */
2253
187k
                                if(0 == ps_ctxt->u1_disable_child_cu_decide)
2254
187k
                                {
2255
187k
                                    ihevce_set_nbr_map(
2256
187k
                                        ps_ctxt->pu1_ctb_nbr_map,
2257
187k
                                        ps_ctxt->i4_nbr_map_strd,
2258
187k
                                        ps_cu_node->ps_parent->u2_x0 << 1,
2259
187k
                                        ps_cu_node->ps_parent->u2_y0 << 1,
2260
187k
                                        (ps_cu_node->ps_parent->u1_cu_size >> 2),
2261
187k
                                        0);
2262
2263
                                    //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
2264
2265
                                    /* Eval for TUSize = CuSize */
2266
187k
                                    ihevce_mode_eval_filtering(
2267
187k
                                        ps_cu_node->ps_parent,
2268
187k
                                        ps_cu_node,
2269
187k
                                        ps_ctxt,
2270
187k
                                        ps_curr_src,
2271
187k
                                        26,
2272
187k
                                        &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
2273
187k
                                        &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
2274
187k
                                        step2_bypass,
2275
187k
                                        1);
2276
2277
187k
                                    if(i4_enable_1cu_4tu)
2278
98.8k
                                    {
2279
                                        /* Eval for TUSize = CuSize/2 */
2280
98.8k
                                        ihevce_mode_eval_filtering(
2281
98.8k
                                            ps_cu_node->ps_parent,
2282
98.8k
                                            ps_cu_node,
2283
98.8k
                                            ps_ctxt,
2284
98.8k
                                            ps_curr_src,
2285
98.8k
                                            26,
2286
98.8k
                                            &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
2287
98.8k
                                            &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
2288
98.8k
                                            step2_bypass,
2289
98.8k
                                            0);
2290
98.8k
                                    }
2291
88.4k
                                    else
2292
88.4k
                                    {
2293
                                        /* 4TU not evaluated :  4tu modes set same as 1tu modes */
2294
88.4k
                                        memcpy(
2295
88.4k
                                            &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
2296
88.4k
                                            &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
2297
88.4k
                                            NUM_BEST_MODES);
2298
2299
                                        /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
2300
88.4k
                                        memcpy(
2301
88.4k
                                            &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
2302
88.4k
                                            &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
2303
88.4k
                                            NUM_BEST_MODES * sizeof(WORD32));
2304
88.4k
                                    }
2305
2306
                                    /* Update parent cost */
2307
187k
                                    parent_cost =
2308
187k
                                        MIN(ps_cu_node->ps_parent->au4_best_cost_4tu[0],
2309
187k
                                            ps_cu_node->ps_parent->au4_best_cost_1tu[0]);
2310
2311
                                    /* Select the best mode to be populated as top and left nbr depending on the
2312
                            4tu and 1tu cost */
2313
187k
                                    if(ps_cu_node->ps_parent->au4_best_cost_4tu[0] >
2314
187k
                                       ps_cu_node->ps_parent->au4_best_cost_1tu[0])
2315
28.6k
                                    {
2316
28.6k
                                        ps_cu_node->ps_parent->best_mode =
2317
28.6k
                                            ps_cu_node->ps_parent->au1_best_mode_1tu[0];
2318
28.6k
                                    }
2319
158k
                                    else
2320
158k
                                    {
2321
158k
                                        ps_cu_node->ps_parent->best_mode =
2322
158k
                                            ps_cu_node->ps_parent->au1_best_mode_4tu[0];
2323
158k
                                    }
2324
187k
                                }
2325
2326
                                /* set the CU valid flag */
2327
187k
                                ps_intra8_analyse->b1_valid_cu = 1;
2328
187k
                                ps_intra8_analyse->b1_enable_nxn = 0;
2329
2330
                                /* storing the modes to intra 8 analyse */
2331
2332
                                /* store the best 8x8 modes 8x8 tu */
2333
187k
                                memcpy(
2334
187k
                                    &ps_intra8_analyse->au1_best_modes_8x8_tu[0],
2335
187k
                                    &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
2336
187k
                                    sizeof(UWORD8) * (NUM_BEST_MODES));
2337
187k
                                ps_intra8_analyse->au1_best_modes_8x8_tu[NUM_BEST_MODES] = 255;
2338
2339
                                /* store the best 8x8 modes 4x4 tu */
2340
187k
                                memcpy(
2341
187k
                                    &ps_intra8_analyse->au1_best_modes_4x4_tu[0],
2342
187k
                                    &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
2343
187k
                                    sizeof(UWORD8) * (NUM_BEST_MODES));
2344
187k
                                ps_intra8_analyse->au1_best_modes_4x4_tu[NUM_BEST_MODES] = 255;
2345
2346
                                /*As 8*8 has won, pick L1 4x4 qp which is equal to
2347
                                L1 8x8 Qp*/
2348
                                //ps_row_cu->u1_cu_possible_qp[0] = u1_cu_possible_qp;
2349
                                //ps_row_cu->i4_act_factor[0][1] = i4_act_factor;
2350
2351
187k
                                parent_best_mode = ps_cu_node->ps_parent->best_mode;
2352
187k
                                if(parent_cost <=
2353
187k
                                   child_cost_least +
2354
187k
                                       (ps_ctxt->i4_ol_satd_lambda * CHILD_BIAS >> LAMBDA_Q_SHIFT))
2355
61.1k
                                {
2356
                                    /*CU = 4TU */
2357
61.1k
                                    ihevce_update_cand_list(
2358
61.1k
                                        ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
2359
2360
                                    /* store the child 8x8 costs */
2361
61.1k
                                    pi4_intra_8_cost[(i & 1) + (MAX_CU_IN_CTB_ROW * (i >> 1))] =
2362
61.1k
                                        parent_cost;
2363
2364
                                    /* cost accumulation of best cu size candidate */
2365
61.1k
                                    i8_frame_acc_satd_cost += parent_cost;
2366
2367
                                    /*satd/mod_qp accumulation of best cu */
2368
61.1k
                                    i8_frame_acc_satd_by_modqp_q10 +=
2369
61.1k
                                        ((LWORD64)ps_cu_node->ps_parent->best_satd
2370
61.1k
                                         << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
2371
61.1k
                                        i4_q_scale_q3_mod;
2372
2373
                                    /* Accumalate mode bits for all child blocks */
2374
61.1k
                                    i8_frame_acc_mode_bits_cost +=
2375
61.1k
                                        ps_cu_node->ps_parent->u2_mode_bits_cost;
2376
2377
                                    /* satd and mpm bits accumulation of best cu size candidate */
2378
61.1k
                                    i4_ctb_acc_satd += ps_cu_node->ps_parent->best_satd;
2379
2380
                                    /* accumulate the 16x16 cost*/
2381
61.1k
                                    if(MAX_INTRA_COST_IPE == *pi4_intra_16_cost)
2382
17.0k
                                    {
2383
17.0k
                                        *pi4_intra_16_cost = parent_cost;
2384
17.0k
                                    }
2385
44.0k
                                    else
2386
44.0k
                                    {
2387
44.0k
                                        *pi4_intra_16_cost += parent_cost;
2388
44.0k
                                    }
2389
2390
                                    /* accumulate the 32x32 cost*/
2391
61.1k
                                    if(MAX_INTRA_COST_IPE == *pi4_intra_32_cost)
2392
6.15k
                                    {
2393
6.15k
                                        *pi4_intra_32_cost = parent_cost;
2394
6.15k
                                    }
2395
54.9k
                                    else
2396
54.9k
                                    {
2397
54.9k
                                        *pi4_intra_32_cost += parent_cost;
2398
54.9k
                                    }
2399
61.1k
                                }
2400
126k
                                else
2401
126k
                                {
2402
                                    /*CU = 4PU*/
2403
                                    //ps_row_cu->b3_cu_pos_x = (UWORD8) ps_cu_node->ps_parent->u2_x0;
2404
                                    //ps_row_cu->b3_cu_pos_y = (UWORD8) ps_cu_node->ps_parent->u2_y0;
2405
                                    //ps_row_cu->u1_cu_size  = ps_cu_node->ps_parent->u1_cu_size;
2406
2407
                                    /* store the child 8x8 costs with 4x4 pu summed cost */
2408
126k
                                    pi4_intra_8_cost[(i & 1) + (MAX_CU_IN_CTB_ROW * (i >> 1))] =
2409
126k
                                        (child_cost_least);
2410
2411
                                    /* accumulate the 16x16 cost*/
2412
126k
                                    if(MAX_INTRA_COST_IPE == *pi4_intra_16_cost)
2413
35.1k
                                    {
2414
35.1k
                                        *pi4_intra_16_cost = child_cost_least;
2415
35.1k
                                    }
2416
91.0k
                                    else
2417
91.0k
                                    {
2418
91.0k
                                        *pi4_intra_16_cost += child_cost_least;
2419
91.0k
                                    }
2420
2421
                                    /* cost accumalation of best cu size candiate */
2422
126k
                                    i8_frame_acc_satd_cost += child_cost_least;
2423
2424
630k
                                    for(j = 0; j < 4; j++)
2425
504k
                                    {
2426
                                        /* satd/qp accumulation */
2427
504k
                                        i8_frame_acc_satd_by_modqp_q10 +=
2428
504k
                                            ((LWORD64)child_satd[j]
2429
504k
                                             << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
2430
504k
                                            i4_q_scale_q3_mod;
2431
2432
                                        /* Accumalate mode bits for all child blocks */
2433
504k
                                        i8_frame_acc_mode_bits_cost +=
2434
504k
                                            ps_cu_node->ps_sub_cu[j]->u2_mode_bits_cost;
2435
2436
                                        /* satd and mpm bits accumalation of best cu size candiate */
2437
504k
                                        i4_ctb_acc_satd += child_satd[j];
2438
504k
                                    }
2439
2440
                                    /* accumulate the 32x32 cost*/
2441
126k
                                    if(MAX_INTRA_COST_IPE == *pi4_intra_32_cost)
2442
11.0k
                                    {
2443
11.0k
                                        *pi4_intra_32_cost = child_cost_least;
2444
11.0k
                                    }
2445
115k
                                    else
2446
115k
                                    {
2447
115k
                                        *pi4_intra_32_cost += child_cost_least;
2448
115k
                                    }
2449
2450
126k
                                    ps_intra8_analyse->b1_enable_nxn = 1;
2451
2452
                                    /* Insert the best 8x8 modes unconditionally */
2453
2454
126k
                                    x = ((ps_cu_node->u2_x0 << 3) >> 2) + 1;
2455
126k
                                    y = ((ps_cu_node->u2_y0 << 3) >> 2) + 1;
2456
126k
                                    size = ps_cu_node->u1_cu_size >> 2;
2457
2458
126k
                                    ps_ctxt->au1_ctb_mode_map[y][x] =
2459
126k
                                        ps_cu_node->ps_sub_cu[0]->best_mode;
2460
126k
                                    ps_ctxt->au1_ctb_mode_map[y][x + 1] =
2461
126k
                                        ps_cu_node->ps_sub_cu[1]->best_mode;
2462
126k
                                    ps_ctxt->au1_ctb_mode_map[y + 1][x] =
2463
126k
                                        ps_cu_node->ps_sub_cu[2]->best_mode;
2464
126k
                                    ps_ctxt->au1_ctb_mode_map[y + 1][x + 1] =
2465
126k
                                        ps_cu_node->ps_sub_cu[3]->best_mode;
2466
126k
                                }
2467
                                /* NXN mode population */
2468
936k
                                for(j = 0; j < 4; j++)
2469
749k
                                {
2470
749k
                                    cand_mode_list[0] =
2471
749k
                                        ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0];
2472
749k
                                    cand_mode_list[1] =
2473
749k
                                        ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[1];
2474
749k
                                    cand_mode_list[2] =
2475
749k
                                        ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[2];
2476
2477
749k
                                    if(1)
2478
749k
                                    {
2479
                                        /* Populate best 3 nxn modes */
2480
749k
                                        ps_intra8_analyse->au1_4x4_best_modes[j][0] =
2481
749k
                                            cand_mode_list[0];
2482
749k
                                        ps_intra8_analyse->au1_4x4_best_modes[j][1] =
2483
749k
                                            cand_mode_list[1];  //(ps_ed + 1)->best_mode;
2484
749k
                                        ps_intra8_analyse->au1_4x4_best_modes[j][2] =
2485
749k
                                            cand_mode_list[2];  //(ps_ed + 2)->best_mode;
2486
749k
                                        ps_intra8_analyse->au1_4x4_best_modes[j][3] = 255;
2487
2488
                                        //memcpy(ps_intra8_analyse->au1_4x4_best_modes[j], ps_row_cu->s_cu_intra_cand.au1_intra_luma_modes_nxn[j], 4);
2489
749k
                                    }
2490
                                    /* For HQ, all 35 modes to be used for RDOPT, removed from here for memory clean-up */
2491
2492
0
                                    else /* IHEVCE_QUALITY_P0 == i4_quality_preset */
2493
0
                                    {
2494
                                        /* To indicate to enc loop that NXN is enabled in HIGH QUALITY for CU 8x8 */
2495
0
                                        ps_intra8_analyse->au1_4x4_best_modes[j][0] = 0;
2496
0
                                    }
2497
2498
749k
                                    ps_intra8_analyse
2499
749k
                                        ->au1_4x4_best_modes[j][MAX_INTRA_CU_CANDIDATES] = 255;
2500
749k
                                }
2501
2502
                                //ps_row_cu++;
2503
187k
                            }
2504
21.5k
                            else
2505
21.5k
                            {
2506
                                /* For Incomplete CTB, 16x16 is not valid */
2507
21.5k
                                ps_intra16_analyse->b1_valid_cu = 0;
2508
21.5k
                            }
2509
208k
                            blk_cnt++;
2510
208k
                            ps_ed_blk_l1++;
2511
208k
                        }
2512
                        //ps_ed_blk_l2 ++;
2513
52.2k
                    }  //else of EIID
2514
55.4k
#endif
2515
55.4k
                }
2516
117k
            }
2517
186k
            else
2518
186k
            {
2519
                /* For incomplete CTB, init valid CU to 0 */
2520
186k
                ps_ed_blk_l1++;
2521
186k
                ps_intra32_analyse->b1_valid_cu = 0;
2522
186k
                ps_intra16_analyse[0].b1_valid_cu = 0;
2523
186k
                blk_cnt++;
2524
186k
                merge_64x64 = 0;
2525
186k
            }
2526
304k
        } while(blk_cnt != MAX_CTB_SIZE);
2527
        /* if 64x64 merge is possible then check for 32x32 having same best modes */
2528
14.0k
        if(1 == merge_64x64)
2529
2.28k
        {
2530
2.28k
            WORD32 act_mode = au1_best_32x32_modes[0];
2531
2532
2.28k
            ps_ed_blk_l2 = ps_ed_l2_ctb;
2533
2.28k
            best_mode = ps_ed_blk_l2->best_mode;
2534
2.28k
            merge_64x64 =
2535
2.28k
                ((act_mode == au1_best_32x32_modes[0]) + (act_mode == au1_best_32x32_modes[1]) +
2536
2.28k
                     (act_mode == au1_best_32x32_modes[2]) +
2537
2.28k
                     (act_mode == au1_best_32x32_modes[3]) ==
2538
2.28k
                 4);
2539
2.28k
            if(merge_64x64 == 1)
2540
2.02k
                best_mode = au1_best_32x32_modes[0];
2541
258
            else
2542
258
                best_mode = ps_ed_blk_l2->best_mode;
2543
            /* All 32x32 costs are accumulated to 64x64 cost */
2544
2.28k
            ps_l0_ipe_out_ctb->i4_best64x64_intra_cost = 0;
2545
11.4k
            for(i = 0; i < 4; i++)
2546
9.12k
            {
2547
9.12k
                ps_l0_ipe_out_ctb->i4_best64x64_intra_cost +=
2548
9.12k
                    ps_l0_ipe_out_ctb->ai4_best32x32_intra_cost[i];
2549
9.12k
            }
2550
2551
            /* If all modes of 32x32 block is not same */
2552
2.28k
            if(0 == merge_64x64)
2553
258
            {
2554
                /* Compute the child cost for 64x64 as the sum of the four 32x32 best costs */
2555
258
                WORD32 child_cost_64x64 = au4_best_32x32_cost[0] + au4_best_32x32_cost[1] +
2556
258
                                          au4_best_32x32_cost[2] + au4_best_32x32_cost[3];
2557
258
                WORD32 cost = MAX_INTRA_COST_IPE;
2558
2559
258
                WORD32 best_mode_temp = 0;
2560
                /*Compute 64x64 cost for each mode of 32x32*/
2561
1.29k
                for(i = 0; i < 4; i++)
2562
1.03k
                {
2563
1.03k
                    WORD32 mode = au1_best_32x32_modes[i];
2564
1.03k
                    if(mode < 2)
2565
562
                        mode = 26;
2566
1.03k
                    ps_cu_node->ps_parent->u1_cu_size = 64;
2567
1.03k
                    ps_cu_node->ps_parent->u2_x0 = gau1_cu_pos_x[0]; /* Populate properly */
2568
1.03k
                    ps_cu_node->ps_parent->u2_y0 = gau1_cu_pos_y[0]; /* Populate properly */
2569
2570
1.03k
                    ihevce_set_nbr_map(
2571
1.03k
                        ps_ctxt->pu1_ctb_nbr_map,
2572
1.03k
                        ps_ctxt->i4_nbr_map_strd,
2573
1.03k
                        (ps_cu_node->ps_parent->u2_x0 << 1),
2574
1.03k
                        (ps_cu_node->ps_parent->u2_y0 << 1),
2575
1.03k
                        (ps_cu_node->ps_parent->u1_cu_size >> 2),
2576
1.03k
                        0);
2577
2578
1.03k
                    ihevce_mode_eval_filtering(
2579
1.03k
                        ps_cu_node->ps_parent,
2580
1.03k
                        ps_cu_node,
2581
1.03k
                        ps_ctxt,
2582
1.03k
                        ps_curr_src,
2583
1.03k
                        mode,
2584
1.03k
                        &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
2585
1.03k
                        &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
2586
1.03k
                        !step2_bypass,
2587
1.03k
                        0);
2588
2589
1.03k
                    parent_cost = ps_cu_node->ps_parent->best_cost;
2590
1.03k
                    if(cost > parent_cost)
2591
260
                    {
2592
260
                        cost = parent_cost;
2593
260
                        best_mode_temp = ps_cu_node->ps_parent->best_mode;
2594
260
                    }
2595
1.03k
                }
2596
258
                if(cost < child_cost_64x64)
2597
149
                {
2598
149
                    merge_64x64 = 1;
2599
149
                    best_mode = best_mode_temp;
2600
2601
                    /* Update 64x64 cost if CU 64x64 is chosen  */
2602
149
                    ps_l0_ipe_out_ctb->i4_best64x64_intra_cost = cost;
2603
2604
                    /* Accumalate the least cost for CU 64x64 */
2605
149
                    i8_frame_acc_satd_cost = cost;
2606
149
                    i8_frame_acc_mode_bits_cost = ps_cu_node->ps_parent->u2_mode_bits_cost;
2607
2608
                    /* satd and mpm bits accumalation of best cu size candiate */
2609
149
                    i4_ctb_acc_satd = ps_cu_node->ps_parent->best_satd;
2610
149
                }
2611
258
            }
2612
2.28k
        }
2613
2614
14.0k
        if(merge_64x64)
2615
2.17k
        {
2616
2.17k
            WORD32 i, j;
2617
2.17k
            intra32_analyse_t *ps_intra32_analyse;
2618
2.17k
            intra16_analyse_t *ps_intra16_analyse;
2619
2.17k
            WORD32 row, col;
2620
2.17k
            WORD32 i4_q_scale_q3_mod;
2621
2.17k
            WORD8 i1_cu_possible_qp;
2622
2.17k
            WORD32 i4_act_factor;
2623
            //ps_row_cu = ps_curr_cu;
2624
2.17k
            ps_ctb_out->u4_cu_split_flags = 0x0;
2625
2.17k
            ps_ed_blk_l1 = ps_ed_l1_ctb;
2626
2.17k
            ps_ed_blk_l2 = ps_ed_l2_ctb;
2627
2628
2.17k
            ps_l0_ipe_out_ctb->u1_split_flag = 0;
2629
2630
            /* If CU size of 64x64 is chosen, clear the merge flag of every 16x16 block */
2631
10.8k
            for(i = 0; i < 4; i++)
2632
8.68k
            {
2633
                /* get the corresponding intra 32 analyse pointer  use (blk_cnt / 16) */
2634
                /* blk cnt is in terms of 8x8 units so a 32x32 will have 16 8x8 units */
2635
8.68k
                ps_intra32_analyse = &ps_l0_ipe_out_ctb->as_intra32_analyse[i];
2636
2637
43.4k
                for(j = 0; j < 4; j++)
2638
34.7k
                {
2639
                    /* get the corresponding intra 16 analyse pointer use (blk_cnt & 0xF / 4)*/
2640
                    /* blk cnt is in terms of 8x8 units so a 16x16 will have 4 8x8 units */
2641
34.7k
                    ps_intra16_analyse = &ps_intra32_analyse->as_intra16_analyse[j];
2642
34.7k
                    ps_intra16_analyse->b1_merge_flag = 0;
2643
34.7k
                }
2644
8.68k
            }
2645
2646
            /* CU size 64x64 and fill the final cu params */
2647
            //ps_row_cu->b3_cu_pos_x = gau1_cu_pos_x[0];
2648
            //ps_row_cu->b3_cu_pos_y = gau1_cu_pos_y[0];
2649
            //ps_row_cu->u1_cu_size  = 64;
2650
2651
            /* Candidate mode Update */
2652
2.17k
            cand_mode_list[0] = best_mode;
2653
2.17k
            if(cand_mode_list[0] > 1)
2654
437
            {
2655
437
                if(cand_mode_list[0] == 2)
2656
7
                {
2657
7
                    cand_mode_list[1] = 34;
2658
7
                    cand_mode_list[2] = 3;
2659
7
                }
2660
430
                else if(cand_mode_list[0] == 34)
2661
7
                {
2662
7
                    cand_mode_list[1] = 2;
2663
7
                    cand_mode_list[2] = 33;
2664
7
                }
2665
423
                else
2666
423
                {
2667
423
                    cand_mode_list[1] = cand_mode_list[0] - 1;
2668
423
                    cand_mode_list[2] = cand_mode_list[0] + 1;
2669
423
                }
2670
                //cand_mode_list[1] = ps_ed_blk_l1->nang_attr.best_mode;
2671
                //cand_mode_list[2] = ps_ed_blk_l1->ang_attr.best_mode;
2672
437
            }
2673
1.73k
            else
2674
1.73k
            {
2675
1.73k
                cand_mode_list[0] = 0;
2676
1.73k
                cand_mode_list[1] = 1;
2677
1.73k
                cand_mode_list[2] = 26;
2678
                //cand_mode_list[2] = ps_ed_blk_l1->nang_attr.best_mode;
2679
1.73k
            }
2680
2681
            /* All 32x32 costs are accumalated to 64x64 cost */
2682
2.17k
            ps_l0_ipe_out_ctb->i4_best64x64_intra_cost = 0;
2683
10.8k
            for(i = 0; i < 4; i++)
2684
8.68k
            {
2685
8.68k
                ps_l0_ipe_out_ctb->i4_best64x64_intra_cost +=
2686
8.68k
                    ps_l0_ipe_out_ctb->ai4_best32x32_intra_cost[i];
2687
8.68k
            }
2688
            /* by default 64x64 modes are set to default values DC and Planar */
2689
2.17k
            ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[0] = cand_mode_list[0];
2690
2.17k
            ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[1] = cand_mode_list[1];
2691
2.17k
            ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[2] = cand_mode_list[2];
2692
2.17k
            ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[3] = 255;
2693
2694
            /* Update CTB mode map for the finalised CU */
2695
2.17k
            x = ((ps_cu_node->u2_x0 << 3) >> 2) + 1;
2696
2.17k
            y = ((ps_cu_node->u2_y0 << 3) >> 2) + 1;
2697
2.17k
            size = ps_cu_node->u1_cu_size >> 2;
2698
2699
20.7k
            for(row = y; row < (y + size); row++)
2700
18.5k
            {
2701
186k
                for(col = x; col < (x + size); col++)
2702
167k
                {
2703
167k
                    ps_ctxt->au1_ctb_mode_map[row][col] = best_mode;
2704
167k
                }
2705
18.5k
            }
2706
2707
2.17k
            ihevce_set_nbr_map(
2708
2.17k
                ps_ctxt->pu1_ctb_nbr_map,
2709
2.17k
                ps_ctxt->i4_nbr_map_strd,
2710
2.17k
                (ps_cu_node->u2_x0 << 1),
2711
2.17k
                (ps_cu_node->u2_y0 << 1),
2712
2.17k
                (ps_cu_node->u1_cu_size >> 2),
2713
2.17k
                1);
2714
2715
            /*As 64*64 has won, pick L1 32x32 qp*/
2716
            //ASSERT(((blk_cnt>>6) & 0xF) == (blk_cnt>>6));
2717
            //ASSERT((blk_cnt>>6) == 0);
2718
2.17k
            ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][0] != -2);
2719
2.17k
            i1_cu_possible_qp = ihevce_cu_level_qp_mod(
2720
2.17k
                ps_ctxt->i4_qscale,
2721
2.17k
                ps_ed_ctb_l1->i4_32x32_satd[0][0],
2722
2.17k
                ps_ctxt->ld_curr_frame_32x32_log_avg[0],
2723
2.17k
                f_strength,
2724
2.17k
                &i4_act_factor,
2725
2.17k
                &i4_q_scale_q3_mod,
2726
2.17k
                ps_ctxt->ps_rc_quant_ctxt);
2727
2728
2.17k
            i8_frame_acc_satd_by_modqp_q10 =
2729
2.17k
                (i8_frame_acc_satd_cost << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
2730
2.17k
                i4_q_scale_q3_mod;
2731
            /* Increment pointers */
2732
2.17k
            ps_ed_blk_l1 += 64;
2733
2.17k
            ps_ed_blk_l2 += 16;
2734
            //ps_row_cu++;
2735
2.17k
        }
2736
14.0k
    }
2737
2738
    //ps_ctb_out->u1_num_cus_in_ctb = (UWORD8)(ps_row_cu - ps_curr_cu);
2739
2740
14.0k
    {
2741
14.0k
        WORD32 i4_i, i4_j;
2742
14.0k
        WORD32 dummy;
2743
14.0k
        WORD8 i1_cu_qp;
2744
14.0k
        (void)i1_cu_qp;
2745
        /*MAM_VAR_L1*/
2746
42.1k
        for(i4_j = 0; i4_j < 2; i4_j++)
2747
28.1k
        {
2748
28.1k
            i4_mod_factor_num = ps_ctxt->ai4_mod_factor_derived_by_variance[i4_j];
2749
28.1k
            f_strength = ps_ctxt->f_strength;
2750
2751
            //i4_mod_factor_num = 4;
2752
2753
28.1k
            ps_ed_blk_l1 = ps_ed_l1_ctb;
2754
28.1k
            ps_ed_blk_l2 = ps_ed_l2_ctb;
2755
            //ps_row_cu = ps_curr_cu;
2756
2757
            /*Valid only for complete CTB */
2758
28.1k
            if((64 == u1_curr_ctb_wdt) && (64 == u1_curr_ctb_hgt))
2759
19.7k
            {
2760
19.7k
                ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][0] != -2);
2761
19.7k
                ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][1] != -2);
2762
19.7k
                ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][2] != -2);
2763
19.7k
                ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][3] != -2);
2764
2765
19.7k
                i1_cu_qp = ihevce_cu_level_qp_mod(
2766
19.7k
                    ps_ctxt->i4_qscale,
2767
19.7k
                    ps_ed_ctb_l1->i4_32x32_satd[0][0],
2768
19.7k
                    ps_ctxt->ld_curr_frame_32x32_log_avg[0],
2769
19.7k
                    f_strength,
2770
19.7k
                    &ps_l0_ipe_out_ctb->i4_64x64_act_factor[0][i4_j],
2771
19.7k
                    &dummy,
2772
19.7k
                    ps_ctxt->ps_rc_quant_ctxt);
2773
2774
19.7k
                i1_cu_qp = ihevce_cu_level_qp_mod(
2775
19.7k
                    ps_ctxt->i4_qscale,
2776
19.7k
                    ps_ed_ctb_l1->i4_32x32_satd[0][1],
2777
19.7k
                    ps_ctxt->ld_curr_frame_32x32_log_avg[1],
2778
19.7k
                    f_strength,
2779
19.7k
                    &ps_l0_ipe_out_ctb->i4_64x64_act_factor[1][i4_j],
2780
19.7k
                    &dummy,
2781
19.7k
                    ps_ctxt->ps_rc_quant_ctxt);
2782
19.7k
                i1_cu_qp = ihevce_cu_level_qp_mod(
2783
19.7k
                    ps_ctxt->i4_qscale,
2784
19.7k
                    ps_ed_ctb_l1->i4_32x32_satd[0][2],
2785
19.7k
                    ps_ctxt->ld_curr_frame_32x32_log_avg[2],
2786
19.7k
                    f_strength,
2787
19.7k
                    &ps_l0_ipe_out_ctb->i4_64x64_act_factor[2][i4_j],
2788
19.7k
                    &dummy,
2789
19.7k
                    ps_ctxt->ps_rc_quant_ctxt);
2790
2791
19.7k
                i1_cu_qp = ihevce_cu_level_qp_mod(
2792
19.7k
                    ps_ctxt->i4_qscale,
2793
19.7k
                    ps_ed_ctb_l1->i4_32x32_satd[0][3],
2794
19.7k
                    2.0 + ps_ctxt->ld_curr_frame_16x16_log_avg[0],
2795
19.7k
                    f_strength,
2796
19.7k
                    &ps_l0_ipe_out_ctb->i4_64x64_act_factor[3][i4_j],
2797
19.7k
                    &dummy,
2798
19.7k
                    ps_ctxt->ps_rc_quant_ctxt);
2799
2800
19.7k
                ASSERT(ps_l0_ipe_out_ctb->i4_64x64_act_factor[3][i4_j] > 0);
2801
19.7k
            }
2802
8.38k
            else
2803
8.38k
            {
2804
8.38k
                ps_l0_ipe_out_ctb->i4_64x64_act_factor[0][i4_j] = 1024;
2805
8.38k
                ps_l0_ipe_out_ctb->i4_64x64_act_factor[1][i4_j] = 1024;
2806
8.38k
                ps_l0_ipe_out_ctb->i4_64x64_act_factor[2][i4_j] = 1024;
2807
8.38k
                ps_l0_ipe_out_ctb->i4_64x64_act_factor[3][i4_j] = 1024;
2808
8.38k
            }
2809
2810
            /*Store the 8x8 Qps from L2 (in raster order) as output of intra prediction
2811
            for the usage by ME*/
2812
2813
28.1k
            {
2814
28.1k
                WORD32 pos_x_32, pos_y_32, pos;
2815
                //WORD32 i4_incomplete_ctb_val_8;
2816
28.1k
                pos_x_32 = u1_curr_ctb_wdt / 16;
2817
28.1k
                pos_y_32 = u1_curr_ctb_hgt / 16;
2818
2819
28.1k
                pos = (pos_x_32 < pos_y_32) ? pos_x_32 : pos_y_32;
2820
2821
140k
                for(i4_i = 0; i4_i < 4; i4_i++)
2822
112k
                {
2823
112k
                    if(i4_i < pos)
2824
83.9k
                    {
2825
83.9k
                        ASSERT(ps_ed_ctb_l1->i4_16x16_satd[i4_i][0] != -2);
2826
83.9k
                        ASSERT(ps_ed_ctb_l1->i4_16x16_satd[i4_i][1] != -2);
2827
83.9k
                        ASSERT(ps_ed_ctb_l1->i4_16x16_satd[i4_i][2] != -2);
2828
83.9k
                        i1_cu_qp = ihevce_cu_level_qp_mod(
2829
83.9k
                            ps_ctxt->i4_qscale,
2830
83.9k
                            ps_ed_ctb_l1->i4_16x16_satd[i4_i][0],
2831
83.9k
                            ps_ctxt->ld_curr_frame_16x16_log_avg[0],
2832
83.9k
                            f_strength,
2833
83.9k
                            &ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][0][i4_j],
2834
83.9k
                            &dummy,
2835
83.9k
                            ps_ctxt->ps_rc_quant_ctxt);
2836
83.9k
                        i1_cu_qp = ihevce_cu_level_qp_mod(
2837
83.9k
                            ps_ctxt->i4_qscale,
2838
83.9k
                            ps_ed_ctb_l1->i4_16x16_satd[i4_i][1],
2839
83.9k
                            ps_ctxt->ld_curr_frame_16x16_log_avg[1],
2840
83.9k
                            f_strength,
2841
83.9k
                            &ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][1][i4_j],
2842
83.9k
                            &dummy,
2843
83.9k
                            ps_ctxt->ps_rc_quant_ctxt);
2844
83.9k
                        i1_cu_qp = ihevce_cu_level_qp_mod(
2845
83.9k
                            ps_ctxt->i4_qscale,
2846
83.9k
                            ps_ed_ctb_l1->i4_16x16_satd[i4_i][2],
2847
83.9k
                            ps_ctxt->ld_curr_frame_16x16_log_avg[2],
2848
83.9k
                            f_strength,
2849
83.9k
                            &ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][2][i4_j],
2850
83.9k
                            &dummy,
2851
83.9k
                            ps_ctxt->ps_rc_quant_ctxt);
2852
83.9k
                    }
2853
28.5k
                    else
2854
28.5k
                    {
2855
                        /*For incomplete CTB */
2856
28.5k
                        ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][0][i4_j] = 1024;
2857
28.5k
                        ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][1][i4_j] = 1024;
2858
28.5k
                        ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][2][i4_j] = 1024;
2859
28.5k
                    }
2860
112k
                }
2861
28.1k
            }
2862
2863
            /*Store the 8x8 Qps from L1 (in raster order) as output of intra prediction
2864
            for the usage by ME*/
2865
28.1k
            {
2866
28.1k
                WORD32 pos_x_16, pos_y_16, pos;
2867
                //WORD32 i4_incomplete_ctb_val_8;
2868
28.1k
                pos_x_16 = u1_curr_ctb_wdt / 4;
2869
28.1k
                pos_y_16 = u1_curr_ctb_hgt / 4;
2870
2871
28.1k
                pos = (pos_x_16 < pos_y_16) ? pos_x_16 : pos_y_16;
2872
478k
                for(i4_i = 0; i4_i < 16; i4_i++)
2873
449k
                {
2874
449k
                    if(i4_i < pos)
2875
348k
                    {
2876
348k
                        ASSERT(ps_ed_ctb_l1->i4_8x8_satd[i4_i][0] != -2);
2877
348k
                        ASSERT(ps_ed_ctb_l1->i4_8x8_satd[i4_i][1] != -2);
2878
348k
                        i1_cu_qp = ihevce_cu_level_qp_mod(
2879
348k
                            ps_ctxt->i4_qscale,
2880
348k
                            ps_ed_ctb_l1->i4_8x8_satd[i4_i][0],
2881
348k
                            ps_ctxt->ld_curr_frame_8x8_log_avg[0],
2882
348k
                            f_strength,
2883
348k
                            &ps_l0_ipe_out_ctb->i4_16x16_act_factor[i4_i][0][i4_j],
2884
348k
                            &dummy,
2885
348k
                            ps_ctxt->ps_rc_quant_ctxt);
2886
348k
                        i1_cu_qp = ihevce_cu_level_qp_mod(
2887
348k
                            ps_ctxt->i4_qscale,
2888
348k
                            ps_ed_ctb_l1->i4_8x8_satd[i4_i][1],
2889
348k
                            ps_ctxt->ld_curr_frame_8x8_log_avg[1],
2890
348k
                            f_strength,
2891
348k
                            &ps_l0_ipe_out_ctb->i4_16x16_act_factor[i4_i][1][i4_j],
2892
348k
                            &dummy,
2893
348k
                            ps_ctxt->ps_rc_quant_ctxt);
2894
348k
                    }
2895
101k
                    else
2896
101k
                    {
2897
                        /*For incomplete CTB */
2898
101k
                        ps_l0_ipe_out_ctb->i4_16x16_act_factor[i4_i][0][i4_j] = 1024;
2899
101k
                        ps_l0_ipe_out_ctb->i4_16x16_act_factor[i4_i][1][i4_j] = 1024;
2900
101k
                    }
2901
449k
                }
2902
28.1k
            }
2903
28.1k
        }  //for loop
2904
2905
        /* Accumalate the cost of ctb to the total cost */
2906
14.0k
        ps_ctxt->i8_frame_acc_satd_cost += i8_frame_acc_satd_cost;
2907
14.0k
        ps_ctxt->i8_frame_acc_satd_by_modqp_q10 += i8_frame_acc_satd_by_modqp_q10;
2908
2909
14.0k
        ps_ctxt->i8_frame_acc_mode_bits_cost += i8_frame_acc_mode_bits_cost;
2910
2911
        /* satd and mpm bits accumalation of best cu size candiate for the ctb */
2912
14.0k
        ps_l0_ipe_out_ctb->i4_ctb_acc_satd = i4_ctb_acc_satd;
2913
14.0k
        ps_l0_ipe_out_ctb->i4_ctb_acc_mpm_bits = i8_frame_acc_mode_bits_cost;
2914
2915
14.0k
        ps_ctxt->i8_frame_acc_satd += i4_ctb_acc_satd;
2916
14.0k
    }
2917
2918
14.0k
    {
2919
14.0k
        WORD32 ctr_8x8;
2920
239k
        for(ctr_8x8 = 0; ctr_8x8 < (MAX_CU_IN_CTB >> 2); ctr_8x8++)
2921
224k
        {
2922
            /*Accumalate activity factor for Intra and Inter*/
2923
224k
            if(ps_l0_ipe_out_ctb->ai4_best_sad_cost_8x8_l1_ipe[ctr_8x8] <
2924
224k
               ps_ed_ctb_l1->i4_sad_me_for_ref[ctr_8x8])
2925
35.7k
            {
2926
35.7k
                ps_l0_ipe_out_ctb->ai4_8x8_act_factor[ctr_8x8] =
2927
35.7k
                    ps_l0_ipe_out_ctb->i4_16x16_act_factor[ctr_8x8][1][0];
2928
35.7k
            }
2929
189k
            else
2930
189k
            {
2931
189k
                ps_l0_ipe_out_ctb->ai4_8x8_act_factor[ctr_8x8] =
2932
189k
                    ps_l0_ipe_out_ctb->i4_16x16_act_factor[ctr_8x8][1][0];
2933
189k
            }
2934
2935
            /*Accumalate activity factor at frame level*/
2936
224k
            ps_ctxt->i8_frame_acc_act_factor += ps_l0_ipe_out_ctb->ai4_8x8_act_factor[ctr_8x8];
2937
224k
        }
2938
14.0k
    }
2939
14.0k
    return;
2940
14.0k
}
2941
2942
/*!
******************************************************************************
* \if Function name : ihevce_nxn_sad_computer \endif
*
* \brief
*    Sum of absolute differences between two trans_size x trans_size blocks
*    of 8 bit samples
*
* \param[in] pu1_inp       : pointer to the top-left sample of the input block
* \param[in] i4_inp_stride : distance in samples between two input rows
* \param[in] pu1_ref       : pointer to the top-left sample of the reference block
* \param[in] i4_ref_stride : distance in samples between two reference rows
* \param[in] trans_size    : width and height of the block
*
* \return
*    Accumulated absolute difference over the whole block
*
*****************************************************************************
*/
WORD32 ihevce_nxn_sad_computer(
    UWORD8 *pu1_inp, WORD32 i4_inp_stride, UWORD8 *pu1_ref, WORD32 i4_ref_stride, WORD32 trans_size)
{
    WORD32 i4_acc = 0;
    WORD32 row, col;

    /* The block is square : trans_size rows of trans_size samples each */
    for(row = 0; row < trans_size; row++)
    {
        for(col = 0; col < trans_size; col++)
        {
            WORD32 i4_diff = (WORD32)pu1_inp[col] - (WORD32)pu1_ref[col];

            i4_acc += ABS(i4_diff);
        }

        /* Step both blocks down to their next row */
        pu1_inp += i4_inp_stride;
        pu1_ref += i4_ref_stride;
    }

    return i4_acc;
}
2963
2964
/*!
2965
******************************************************************************
2966
* \if Function name : ihevce_mode_eval_filtering \endif
2967
*
2968
* \brief
2969
*    Evaluates best 3 modes for the given CU size with probable modes from,
2970
*    early decision structure, mpm candidates and dc, planar mode
2971
*
2972
* \param[in] ps_cu_node : pointer to MAX cu node info buffer
2973
* \param[in] ps_child_cu_node : pointer to (MAX - 1) cu node info buffer
2974
* \param[in] ps_ctxt : pointer to IPE context struct
2975
* \param[in] ps_curr_src : pointer to src pixels struct
2976
* \param[in] best_amode : best angular mode from l1 layer or
2977
                            from (MAX - 1) CU mode
2978
* \param[in] best_costs_4x4  : pointer to 3 best cost buffer
2979
* \param[in] best_modes_4x4  : pointer to 3 best mode buffer
2980
* \param[in] step2_bypass : if 0, (MAX - 1) CU is evaluated
2981
*                           if 1, (MAX CU) sugested is evaluated
2982
* \param[in] tu_eq_cu     : indicates if tu size is same as cu or cu/2
2983
*
2984
* \return
2985
*    None
2986
*
2987
* \author
2988
*  Ittiam
2989
*
2990
*****************************************************************************
2991
*/
2992
void ihevce_mode_eval_filtering(
2993
    ihevce_ipe_cu_tree_t *ps_cu_node,
2994
    ihevce_ipe_cu_tree_t *ps_child_cu_node,
2995
    ihevce_ipe_ctxt_t *ps_ctxt,
2996
    iv_enc_yuv_buf_t *ps_curr_src,
2997
    WORD32 best_amode,
2998
    WORD32 *best_costs_4x4,
2999
    UWORD8 *best_modes_4x4,
3000
    WORD32 step2_bypass,
3001
    WORD32 tu_eq_cu)
3002
697k
{
3003
697k
    UWORD8 *pu1_origin, *pu1_orig;
3004
697k
    WORD32 src_strd = ps_curr_src->i4_y_strd;
3005
697k
    WORD32 nbr_flags;
3006
697k
    nbr_avail_flags_t s_nbr;
3007
697k
    WORD32 trans_size = tu_eq_cu ? ps_cu_node->u1_cu_size : ps_cu_node->u1_cu_size >> 1;
3008
697k
    WORD32 num_tu_in_x = tu_eq_cu ? 1 : 2;
3009
697k
    WORD32 num_tu_in_y = tu_eq_cu ? 1 : 2;
3010
697k
    UWORD8 mode;
3011
3012
697k
    WORD32 cost_ang_mode = MAX_INTRA_COST_IPE;
3013
697k
    WORD32 filter_flag;
3014
697k
    WORD32 cost_amode_step2[7] = { 0 };
3015
    /*WORD32 best_sad[5];  // NOTE_A01: Not getting consumed at present */
3016
697k
    WORD32 sad = 0;
3017
697k
    WORD32 cu_pos_x, cu_pos_y;
3018
697k
    WORD32 temp;
3019
697k
    WORD32 i = 0, j, k, i_end, z;
3020
    //WORD32 row, col, size;
3021
697k
    UWORD8 *pu1_ref;
3022
697k
    WORD32 xA, yA, xB, yB;
3023
697k
    WORD32 top_intra_mode;
3024
697k
    WORD32 left_intra_mode;
3025
697k
    UWORD8 *pu1_ref_orig = &ps_ctxt->au1_ref_samples[0];
3026
697k
    UWORD8 *pu1_ref_filt = &ps_ctxt->au1_filt_ref_samples[0];
3027
3028
697k
    UWORD8 modes_4x4[5] = { 0, 1, 2, 3, 4 };
3029
697k
    WORD32 count;
3030
3031
697k
    pf_ipe_res_trans_had apf_resd_trns_had[4];
3032
3033
697k
    WORD32 cand_mode_satd_list[3];
3034
697k
    ihevc_intra_pred_luma_ref_substitution_ft *ihevc_intra_pred_luma_ref_substitution_fptr;
3035
3036
697k
    ihevc_intra_pred_luma_ref_substitution_fptr =
3037
697k
        ps_ctxt->ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;
3038
3039
697k
    apf_resd_trns_had[0] = ps_ctxt->s_cmn_opt_func.pf_HAD_4x4_8bit;
3040
697k
    apf_resd_trns_had[1] = ps_ctxt->s_cmn_opt_func.pf_HAD_8x8_8bit;
3041
697k
    apf_resd_trns_had[2] = ps_ctxt->s_cmn_opt_func.pf_HAD_16x16_8bit;
3042
697k
    apf_resd_trns_had[3] = ps_ctxt->s_cmn_opt_func.pf_HAD_32x32_8bit;
3043
3044
    /* initialize modes_to_eval as zero */
3045
697k
    memset(&ps_ctxt->au1_modes_to_eval, 0, MAX_NUM_IP_MODES);
3046
3047
    /* Compute the Parent Cost */
3048
3049
    /* Pointer to top-left of the CU - y0,x0 in 8x8 granularity */
3050
697k
    pu1_orig = (UWORD8 *)(ps_curr_src->pv_y_buf) + ((ps_cu_node->u2_y0 << 3) * src_strd) +
3051
697k
               (ps_cu_node->u2_x0 << 3);
3052
3053
    /* Get position of CU within CTB at 4x4 granularity */
3054
697k
    cu_pos_x = ps_cu_node->u2_x0 << 1;
3055
697k
    cu_pos_y = ps_cu_node->u2_y0 << 1;
3056
3057
    /* get the neighbour availability flags */
3058
697k
    ihevce_get_only_nbr_flag(
3059
697k
        &s_nbr,
3060
697k
        ps_ctxt->pu1_ctb_nbr_map,
3061
697k
        ps_ctxt->i4_nbr_map_strd,
3062
697k
        cu_pos_x,
3063
697k
        cu_pos_y,
3064
697k
        trans_size >> 2,
3065
697k
        trans_size >> 2);
3066
3067
    /* Traverse for all 4 child blocks in the parent block */
3068
697k
    xA = (ps_cu_node->u2_x0 << 3) >> 2;
3069
697k
    yA = ((ps_cu_node->u2_y0 << 3) >> 2) + 1;
3070
697k
    xB = xA + 1;
3071
697k
    yB = yA - 1;
3072
697k
    left_intra_mode = ps_ctxt->au1_ctb_mode_map[yA][xA];
3073
697k
    top_intra_mode = ps_ctxt->au1_ctb_mode_map[yB][xB];
3074
    /* call the function which populates sad cost for all the modes */
3075
3076
697k
    ihevce_intra_populate_mode_bits_cost_bracketing(
3077
697k
        top_intra_mode,
3078
697k
        left_intra_mode,
3079
697k
        s_nbr.u1_top_avail,
3080
697k
        s_nbr.u1_left_avail,
3081
697k
        ps_cu_node->u2_y0,
3082
697k
        &ps_ctxt->au2_mode_bits_satd_cost[0],
3083
697k
        &ps_ctxt->au2_mode_bits_satd[0],
3084
697k
        ps_ctxt->i4_ol_satd_lambda,
3085
697k
        cand_mode_satd_list);
3086
3087
1.61M
    for(k = 0; k < num_tu_in_y; k++)
3088
919k
    {
3089
2.28M
        for(j = 0; j < num_tu_in_x; j++)
3090
1.36M
        {
3091
            /* get the neighbour availability flags */
3092
1.36M
            nbr_flags = ihevce_get_nbr_intra(
3093
1.36M
                &s_nbr,
3094
1.36M
                ps_ctxt->pu1_ctb_nbr_map,
3095
1.36M
                ps_ctxt->i4_nbr_map_strd,
3096
1.36M
                cu_pos_x + ((j) * (trans_size >> 2)),
3097
1.36M
                cu_pos_y + ((k) * (trans_size >> 2)),
3098
1.36M
                trans_size >> 2);
3099
3100
1.36M
            pu1_origin = pu1_orig + (k * trans_size * src_strd) + (j * trans_size);
3101
3102
            /* Create reference samples array */
3103
1.36M
            ihevc_intra_pred_luma_ref_substitution_fptr(
3104
1.36M
                pu1_origin - src_strd - 1,
3105
1.36M
                pu1_origin - src_strd,
3106
1.36M
                pu1_origin - 1,
3107
1.36M
                src_strd,
3108
1.36M
                trans_size,
3109
1.36M
                nbr_flags,
3110
1.36M
                pu1_ref_orig,
3111
1.36M
                0);
3112
3113
            /* Perform reference samples filtering */
3114
1.36M
            ihevce_intra_pred_ref_filtering(pu1_ref_orig, trans_size, pu1_ref_filt);
3115
3116
1.36M
            ihevce_set_nbr_map(
3117
1.36M
                ps_ctxt->pu1_ctb_nbr_map,
3118
1.36M
                ps_ctxt->i4_nbr_map_strd,
3119
1.36M
                cu_pos_x + ((j) * (trans_size >> 2)),
3120
1.36M
                cu_pos_y + ((k) * (trans_size >> 2)),
3121
1.36M
                (trans_size >> 2),
3122
1.36M
                1);
3123
3124
1.36M
            pu1_ref_orig += (4 * MAX_CTB_SIZE + 1);
3125
1.36M
            pu1_ref_filt += (4 * MAX_CTB_SIZE + 1);
3126
1.36M
        }
3127
919k
    }
3128
3129
    /* Revaluation for angular mode */
3130
    //if(ps_ed_blk->ang_attr.mode_present == 1)
3131
    //if(((best_amode & 0x1) != 1))
3132
3133
697k
    {
3134
697k
        WORD32 u1_trans_idx = trans_size >> 3;
3135
697k
        if(trans_size == 32)
3136
20.6k
            u1_trans_idx = 3;
3137
        //best_amode = ps_ed_blk->ang_attr.best_mode;
3138
3139
697k
        i = 0;
3140
697k
        if(!step2_bypass)
3141
329k
        {
3142
            /* Around best level 4 angular mode, search for best level 2 mode */
3143
329k
            ASSERT((best_amode >= 2) && (best_amode <= 34));
3144
3145
329k
            if(ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P3)
3146
235k
            {
3147
235k
                if(best_amode >= 4)
3148
229k
                    ps_ctxt->au1_modes_to_eval_temp[i++] = best_amode - 2;
3149
235k
            }
3150
3151
329k
            ps_ctxt->au1_modes_to_eval_temp[i++] = best_amode;
3152
3153
329k
            if(ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P3)
3154
235k
            {
3155
235k
                if(best_amode <= 32)
3156
232k
                    ps_ctxt->au1_modes_to_eval_temp[i++] = best_amode + 2;
3157
235k
            }
3158
329k
        }
3159
367k
        else
3160
367k
        {
3161
367k
            ps_ctxt->au1_modes_to_eval_temp[i++] = ps_child_cu_node->ps_sub_cu[0]->best_mode;
3162
367k
            ps_ctxt->au1_modes_to_eval_temp[i++] = ps_child_cu_node->ps_sub_cu[1]->best_mode;
3163
367k
            ps_ctxt->au1_modes_to_eval_temp[i++] = ps_child_cu_node->ps_sub_cu[2]->best_mode;
3164
367k
            ps_ctxt->au1_modes_to_eval_temp[i++] = ps_child_cu_node->ps_sub_cu[3]->best_mode;
3165
367k
        }
3166
3167
        /* Add the left and top MPM modes for computation*/
3168
3169
697k
        ps_ctxt->au1_modes_to_eval_temp[i++] = cand_mode_satd_list[0];
3170
697k
        ps_ctxt->au1_modes_to_eval_temp[i++] = cand_mode_satd_list[1];
3171
3172
697k
        i_end = i;
3173
697k
        count = 0;
3174
3175
        /*Remove duplicate modes from modes_to_eval_temp[] */
3176
4.35M
        for(j = 0; j < i_end; j++)
3177
3.65M
        {
3178
7.71M
            for(k = 0; k < count; k++)
3179
4.64M
            {
3180
4.64M
                if(ps_ctxt->au1_modes_to_eval_temp[j] == ps_ctxt->au1_modes_to_eval[k])
3181
593k
                    break;
3182
4.64M
            }
3183
3.65M
            if((k == count) && (ps_ctxt->au1_modes_to_eval_temp[j] > 1))
3184
1.82M
            {
3185
1.82M
                ps_ctxt->au1_modes_to_eval[count] = ps_ctxt->au1_modes_to_eval_temp[j];
3186
1.82M
                count++;
3187
1.82M
            }
3188
3.65M
        }
3189
697k
        i_end = count;
3190
697k
        if(count == 0)
3191
45.9k
        {
3192
45.9k
            ps_ctxt->au1_modes_to_eval[0] = 26;
3193
45.9k
            i_end = 1;
3194
45.9k
        }
3195
3196
2.56M
        for(i = 0; i < i_end; i++)
3197
1.87M
        {
3198
1.87M
            pu1_ref_orig = &ps_ctxt->au1_ref_samples[0];
3199
1.87M
            pu1_ref_filt = &ps_ctxt->au1_filt_ref_samples[0];
3200
3201
1.87M
            mode = ps_ctxt->au1_modes_to_eval[i];
3202
1.87M
            ASSERT((mode >= 2) && (mode <= 34));
3203
1.87M
            cost_amode_step2[i] = ps_ctxt->au2_mode_bits_satd_cost[mode];
3204
1.87M
            filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(trans_size) - 2));
3205
3206
4.40M
            for(k = 0; k < num_tu_in_y; k++)
3207
2.53M
            {
3208
6.40M
                for(j = 0; j < num_tu_in_x; j++)
3209
3.86M
                {
3210
3.86M
                    pu1_origin = pu1_orig + (k * trans_size * src_strd) + (j * trans_size);
3211
3212
3.86M
                    if(0 == filter_flag)
3213
3.64M
                        pu1_ref = pu1_ref_orig;
3214
219k
                    else
3215
219k
                        pu1_ref = pu1_ref_filt;
3216
3217
3.86M
                    g_apf_lum_ip[g_i4_ip_funcs[mode]](
3218
3.86M
                        pu1_ref, 0, &ps_ctxt->au1_pred_samples[0], trans_size, trans_size, mode);
3219
3220
3.86M
                    if(ps_ctxt->u1_use_satd)
3221
3.57M
                    {
3222
3.57M
                        sad = apf_resd_trns_had[u1_trans_idx](
3223
3.57M
                            pu1_origin,
3224
3.57M
                            ps_curr_src->i4_y_strd,
3225
3.57M
                            &ps_ctxt->au1_pred_samples[0],
3226
3.57M
                            trans_size,
3227
3.57M
                            NULL,
3228
3.57M
                            0
3229
3230
3.57M
                        );
3231
3.57M
                    }
3232
292k
                    else
3233
292k
                    {
3234
292k
                        sad = ps_ctxt->s_ipe_optimised_function_list.pf_nxn_sad_computer(
3235
292k
                            pu1_origin,
3236
292k
                            ps_curr_src->i4_y_strd,
3237
292k
                            &ps_ctxt->au1_pred_samples[0],
3238
292k
                            trans_size,
3239
292k
                            trans_size);
3240
292k
                    }
3241
3242
3.86M
                    cost_amode_step2[i] += sad;
3243
3244
3.86M
                    pu1_ref_orig += (4 * MAX_CTB_SIZE + 1);
3245
3.86M
                    pu1_ref_filt += (4 * MAX_CTB_SIZE + 1);
3246
3.86M
                }
3247
2.53M
            }
3248
1.87M
        }
3249
697k
        best_amode = ps_ctxt->au1_modes_to_eval[0];
3250
        /*Init cost indx */
3251
697k
        cost_ang_mode = MAX_INTRA_COST_IPE;  //cost_amode_step2[0];
3252
2.56M
        for(z = 0; z < i_end; z++)
3253
1.87M
        {
3254
            /* Least cost of all 3 angles are stored in cost_amode_step2[0] and corr. mode*/
3255
1.87M
            if(cost_ang_mode >= cost_amode_step2[z])
3256
1.19M
            {
3257
1.19M
                if(cost_ang_mode == cost_amode_step2[z])
3258
84.6k
                {
3259
84.6k
                    if(best_amode > ps_ctxt->au1_modes_to_eval[z])
3260
3.52k
                        best_amode = ps_ctxt->au1_modes_to_eval[z];
3261
84.6k
                }
3262
1.10M
                else
3263
1.10M
                {
3264
1.10M
                    best_amode = ps_ctxt->au1_modes_to_eval[z];
3265
1.10M
                }
3266
1.19M
                cost_ang_mode = cost_amode_step2[z];
3267
1.19M
            }
3268
1.87M
        }
3269
3270
        /*Modify mode bits for the angular modes */
3271
697k
    }
3272
3273
697k
    {
3274
        /* Step - I modification */
3275
697k
        ASSERT((best_amode >= 2) && (best_amode <= 34));
3276
697k
        i_end = 0;
3277
697k
        z = 0;
3278
3279
        /* Around best level 3 angular mode, search for best level 1 mode */
3280
697k
        ps_ctxt->au1_modes_to_eval[i_end++] = 0;
3281
697k
        ps_ctxt->au1_modes_to_eval[i_end++] = 1;
3282
3283
697k
        if(best_amode != 2)
3284
668k
            ps_ctxt->au1_modes_to_eval[i_end++] = best_amode - 1;
3285
3286
697k
        ps_ctxt->au1_modes_to_eval[i_end++] = best_amode;
3287
3288
697k
        if(best_amode != 34)
3289
688k
            ps_ctxt->au1_modes_to_eval[i_end++] = best_amode + 1;
3290
3291
        /* Inserting step_2's best mode at last to avoid
3292
        recalculation of it's SATD cost */
3293
3294
        //ps_ctxt->au1_modes_to_eval[i_end] = best_amode; //Bugfix: HSAD compared with SAD
3295
        //cost_amode_step2[i_end] = cost_ang_mode;
3296
3297
        /*best_sad[i_end] = cost_ang_mode
3298
                - mode_bits_satd_cost[best_amode]; //See NOTE_A01 above */
3299
3300
697k
        cost_ang_mode = MAX_INTRA_COST_IPE; /* Init cost */
3301
3302
4.14M
        for(i = 0; i < i_end; i++)
3303
3.44M
        {
3304
3.44M
            WORD32 u1_trans_idx = trans_size >> 3;
3305
3.44M
            if(trans_size == 32)
3306
102k
                u1_trans_idx = 3;
3307
3.44M
            pu1_ref_orig = &ps_ctxt->au1_ref_samples[0];
3308
3.44M
            pu1_ref_filt = &ps_ctxt->au1_filt_ref_samples[0];
3309
3310
            /*best_sad[i] = 0; //See NOTE_A01 above */
3311
3.44M
            mode = ps_ctxt->au1_modes_to_eval[i];
3312
3.44M
            cost_amode_step2[i] = ps_ctxt->au2_mode_bits_satd_cost[mode];
3313
3.44M
            filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(trans_size) - 2));
3314
3315
8.00M
            for(k = 0; k < num_tu_in_y; k++)
3316
4.55M
            {
3317
11.3M
                for(j = 0; j < num_tu_in_x; j++)
3318
6.75M
                {
3319
6.75M
                    pu1_origin = pu1_orig + (k * trans_size * src_strd) + (j * trans_size);
3320
3321
6.75M
                    if(0 == filter_flag)
3322
5.85M
                        pu1_ref = pu1_ref_orig;
3323
906k
                    else
3324
906k
                        pu1_ref = pu1_ref_filt;
3325
3326
6.75M
                    g_apf_lum_ip[g_i4_ip_funcs[mode]](
3327
6.75M
                        pu1_ref, 0, &ps_ctxt->au1_pred_samples[0], trans_size, trans_size, mode);
3328
3329
                    //if(trans_size != 4)
3330
6.75M
                    {
3331
6.75M
                        sad = apf_resd_trns_had[u1_trans_idx](
3332
6.75M
                            pu1_origin,
3333
6.75M
                            ps_curr_src->i4_y_strd,
3334
6.75M
                            &ps_ctxt->au1_pred_samples[0],
3335
6.75M
                            trans_size,
3336
6.75M
                            NULL,
3337
6.75M
                            0);
3338
6.75M
                    }
3339
3340
                    /*accumualting SATD though name says it is sad*/
3341
6.75M
                    cost_amode_step2[i] += sad;
3342
                    /*best_sad[i] +=sad; //See NOTE_A01 above */
3343
6.75M
                    pu1_ref_orig += (4 * MAX_CTB_SIZE + 1);
3344
6.75M
                    pu1_ref_filt += (4 * MAX_CTB_SIZE + 1);
3345
6.75M
                }
3346
4.55M
            }
3347
3.44M
        }
3348
        /* Updating i_end for the step_2's inserted mode*/
3349
        //        i_end++;
3350
3351
        /* Arrange the reference array in ascending order */
3352
3353
3.44M
        for(i = 0; i < (i_end - 1); i++)
3354
2.75M
        {
3355
9.57M
            for(j = i + 1; j < i_end; j++)
3356
6.82M
            {
3357
6.82M
                if(cost_amode_step2[i] > cost_amode_step2[j])
3358
2.66M
                {
3359
2.66M
                    temp = cost_amode_step2[i];
3360
2.66M
                    cost_amode_step2[i] = cost_amode_step2[j];
3361
2.66M
                    cost_amode_step2[j] = temp;
3362
3363
2.66M
                    temp = modes_4x4[i];
3364
2.66M
                    modes_4x4[i] = modes_4x4[j];
3365
2.66M
                    modes_4x4[j] = temp;
3366
2.66M
                }
3367
6.82M
            }
3368
2.75M
        }
3369
3370
        /* Least cost of all 3 angles are stored in cost_amode_step2[0] and corr. mode*/
3371
697k
        best_amode = ps_ctxt->au1_modes_to_eval[modes_4x4[0]];
3372
697k
        cost_ang_mode = cost_amode_step2[0];
3373
697k
        ps_cu_node->best_satd = cost_ang_mode - ps_ctxt->au2_mode_bits_satd_cost[best_amode];
3374
697k
        ps_cu_node->best_cost = cost_amode_step2[0];
3375
697k
        ps_cu_node->best_mode = ps_ctxt->au1_modes_to_eval[modes_4x4[0]];
3376
697k
        ps_cu_node->best_satd =
3377
697k
            ps_cu_node->best_cost - ps_ctxt->au2_mode_bits_satd_cost[ps_cu_node->best_mode];
3378
3379
        /*Accumalate best mode bits cost for RC*/
3380
697k
        ps_cu_node->u2_mode_bits_cost = ps_ctxt->au2_mode_bits_satd[ps_cu_node->best_mode];
3381
3382
        /* Store the best three candidates */
3383
2.78M
        for(i = 0; i < 3; i++)
3384
2.09M
        {
3385
2.09M
            best_costs_4x4[i] = cost_amode_step2[i];
3386
2.09M
            best_modes_4x4[i] = ps_ctxt->au1_modes_to_eval[modes_4x4[i]];
3387
2.09M
        }
3388
697k
    }
3389
3390
697k
    return;
3391
697k
}