Coverage Report

Created: 2025-11-05 07:08

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libhevc/encoder/ihevce_decomp_pre_intra_pass.c
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2018 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
21
/*!
22
******************************************************************************
23
* \file ihevce_decomp_pre_intra_pass.c
24
*
25
* \brief
26
*    This file contains definitions related to frame decomposition done during
27
*    pre intra processing
28
*
29
* \date
30
*    19/02/2013
31
*
32
* \author
33
*    Ittiam
34
*
35
* List of Functions
36
*    ihevce_intra_populate_mode_bits_cost()
37
*    ihevce_8x8_sad_computer()
38
*    ihevce_4x4_sad_computer()
39
*    ihevce_ed_4x4_find_best_modes()
40
*    ihevce_ed_calc_4x4_blk()
41
*    ihevce_ed_calc_8x8_blk()
42
*    ihevce_ed_calc_incomplete_ctb()
43
*    ihevce_cu_level_qp_mod()
44
*    ihevce_ed_calc_ctb()
45
*    ihevce_ed_frame_init()
46
*    ihevce_scale_by_2()
47
*    ihevce_decomp_pre_intra_process_row()
48
*    ihevce_decomp_pre_intra_process()
49
*    ihevce_decomp_pre_intra_get_num_mem_recs()
50
*    ihevce_decomp_pre_intra_get_mem_recs()
51
*    ihevce_decomp_pre_intra_init()
52
*    ihevce_decomp_pre_intra_frame_init()
53
*    ihevce_merge_sort()
54
*    ihevce_decomp_pre_intra_curr_frame_pre_intra_deinit()
55
*
56
******************************************************************************
57
*/
58
59
/*****************************************************************************/
60
/* File Includes                                                             */
61
/*****************************************************************************/
62
/* System include files */
63
#include <stdio.h>
64
#include <string.h>
65
#include <stdlib.h>
66
#include <assert.h>
67
#include <stdarg.h>
68
#include <stdint.h>
69
#include <math.h>
70
#include <limits.h>
71
72
/* User include files */
73
#include "ihevc_typedefs.h"
74
#include "itt_video_api.h"
75
#include "ihevce_api.h"
76
77
#include "rc_cntrl_param.h"
78
#include "rc_frame_info_collector.h"
79
#include "rc_look_ahead_params.h"
80
81
#include "ihevc_defs.h"
82
#include "ihevc_debug.h"
83
#include "ihevc_structs.h"
84
#include "ihevc_platform_macros.h"
85
#include "ihevc_deblk.h"
86
#include "ihevc_itrans_recon.h"
87
#include "ihevc_chroma_itrans_recon.h"
88
#include "ihevc_chroma_intra_pred.h"
89
#include "ihevc_intra_pred.h"
90
#include "ihevc_inter_pred.h"
91
#include "ihevc_mem_fns.h"
92
#include "ihevc_padding.h"
93
#include "ihevc_weighted_pred.h"
94
#include "ihevc_sao.h"
95
#include "ihevc_resi_trans.h"
96
#include "ihevc_quant_iquant_ssd.h"
97
#include "ihevc_cabac_tables.h"
98
99
#include "ihevce_defs.h"
100
#include "ihevce_hle_interface.h"
101
#include "ihevce_lap_enc_structs.h"
102
#include "ihevce_multi_thrd_structs.h"
103
#include "ihevce_multi_thrd_funcs.h"
104
#include "ihevce_me_common_defs.h"
105
#include "ihevce_had_satd.h"
106
#include "ihevce_error_codes.h"
107
#include "ihevce_bitstream.h"
108
#include "ihevce_cabac.h"
109
#include "ihevce_rdoq_macros.h"
110
#include "ihevce_function_selector.h"
111
#include "ihevce_enc_structs.h"
112
#include "ihevce_entropy_structs.h"
113
#include "ihevce_cmn_utils_instr_set_router.h"
114
#include "ihevce_ipe_instr_set_router.h"
115
#include "ihevce_decomp_pre_intra_structs.h"
116
#include "ihevce_decomp_pre_intra_pass.h"
117
#include "ihevce_enc_loop_structs.h"
118
#include "hme_datatype.h"
119
#include "hme_interface.h"
120
#include "hme_common_defs.h"
121
#include "ihevce_global_tables.h"
122
123
/*****************************************************************************/
124
/* Global variables                                                          */
125
/*****************************************************************************/
126
127
/**
128
*****************************************************************************
129
* @brief subset of intra modes to be evaluated during pre enc intra process
130
*****************************************************************************
131
*/
132
/* Entries 0-1 are modes 0 and 1, which the 4x4 search in this file tracks   */
/* separately as non-angular (mode < 2) and skips when starting at index 2.  */
/* Entries 2-10 are the angular candidates of the coarse search.             */
static const UWORD8 gau1_modes_to_eval[11] = { 0, 1, 26, 2, 6, 10, 14, 18, 22, 30, 34 };
133
134
/**
135
*****************************************************************************
136
* @brief  list of pointers to luma intra pred functions
137
*****************************************************************************
138
*/
139
/* Called in this file as g_apf_lum_ip[g_i4_ip_funcs[mode]](...).            */
/* No initialiser is given here; NOTE(review): presumably populated during   */
/* encoder init elsewhere - confirm before relying on any entry.             */
pf_intra_pred g_apf_lum_ip[NUM_IP_FUNCS];
140
141
/*****************************************************************************/
142
/* Function Definitions                                                      */
143
/*****************************************************************************/
144
145
/*!
146
******************************************************************************
147
* \if Function name : ihevce_intra_populate_mode_bits_cost \endif
148
*
149
* \brief: look-up table of cost of signalling an intra mode in the
150
*  bitstream
151
*
152
*****************************************************************************
153
*/
154
static void ihevce_intra_populate_mode_bits_cost(UWORD16 *mode_bits_cost, WORD32 lambda)
155
14.7M
{
156
14.7M
    WORD32 i;
157
    // 5.5 * lambda
158
14.7M
    UWORD16 five_bits_cost = COMPUTE_RATE_COST_CLIP30(11, lambda, (LAMBDA_Q_SHIFT + 1));
159
160
529M
    for(i = 0; i < NUM_MODES; i++)
161
514M
    {
162
514M
        mode_bits_cost[i] = five_bits_cost;
163
514M
    }
164
14.7M
}
165
166
/*!
167
******************************************************************************
168
* \if Function name : ihevce_8x8_sad_computer \endif
169
*
170
* \brief: compute sad between 2 8x8 blocks
171
*
172
*****************************************************************************
173
*/
174
UWORD16 ihevce_8x8_sad_computer(UWORD8 *src, UWORD8 *pred, WORD32 src_strd, WORD32 pred_strd)
175
3.36M
{
176
3.36M
    UWORD16 sad = 0;
177
3.36M
    WORD32 i, j;
178
179
30.2M
    for(i = 0; i < 8; i++)
180
26.8M
    {
181
241M
        for(j = 0; j < 8; j++)
182
215M
        {
183
215M
            sad += ABS(src[j] - pred[j]);
184
215M
        }
185
26.8M
        src += src_strd;
186
26.8M
        pred += pred_strd;
187
26.8M
    }
188
189
3.36M
    return sad;
190
3.36M
}
191
192
/*!
193
******************************************************************************
194
* \if Function name : ihevce_4x4_sad_computer \endif
195
*
196
* \brief: compute sad between 2 4x4 blocks
197
*
198
*****************************************************************************
199
*/
200
UWORD16 ihevce_4x4_sad_computer(UWORD8 *src, UWORD8 *pred, WORD32 src_strd, WORD32 pred_strd)
201
235M
{
202
235M
    UWORD16 sad = 0;
203
235M
    WORD32 i, j;
204
205
1.17G
    for(i = 0; i < 4; i++)
206
942M
    {
207
4.71G
        for(j = 0; j < 4; j++)
208
3.76G
        {
209
3.76G
            sad += ABS(src[j] - pred[j]);
210
3.76G
        }
211
942M
        src += src_strd;
212
942M
        pred += pred_strd;
213
942M
    }
214
215
235M
    return sad;
216
235M
}
217
218
/*!
219
******************************************************************************
220
* \if Function name : ihevce_ed_4x4_find_best_modes \endif
221
*
222
* \brief: evaluate input 4x4 block for pre-selected list intra modes and
223
* return best sad, cost
224
*
225
*****************************************************************************
226
*/
227
void ihevce_ed_4x4_find_best_modes(
228
    UWORD8 *pu1_src,
229
    WORD32 src_stride,
230
    UWORD8 *ref,
231
    UWORD16 *mode_bits_cost,
232
    UWORD8 *pu1_best_modes,
233
    WORD32 *pu1_best_sad_costs,
234
    WORD32 u1_low_resol,
235
    FT_SAD_COMPUTER *pf_4x4_sad_computer)
236
16.8M
{
237
16.8M
    WORD32 i;
238
16.8M
    UWORD8 mode = 0, best_amode = 0, best_nmode = 0;
239
16.8M
    UWORD8 pred[16];
240
16.8M
    WORD32 sad = 0;
241
16.8M
    WORD32 sad_cost = 0;
242
16.8M
    WORD32 best_asad_cost = 0xFFFFF;
243
16.8M
    WORD32 best_nsad_cost = 0xFFFFF;
244
245
    /* If lower layers, l1 or l2, all the 11 modes are evaluated */
246
    /* If L0 layer, all modes excluding DC and Planar are evaluated */
247
16.8M
    if(1 == u1_low_resol)
248
14.7M
        i = 0;
249
2.13M
    else
250
2.13M
        i = 2;
251
252
    /* Find the best non-angular and angular mode till level 4 */
253
197M
    for(; i < 11; i++)
254
181M
    {
255
181M
        mode = gau1_modes_to_eval[i];
256
181M
        g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
257
181M
        sad = pf_4x4_sad_computer(pu1_src, pred, src_stride, 4);
258
181M
        sad_cost = sad + mode_bits_cost[mode];
259
181M
        if(mode < 2)
260
29.4M
        {
261
29.4M
            if(sad_cost < best_nsad_cost)
262
15.0M
            {
263
15.0M
                best_nmode = mode;
264
15.0M
                best_nsad_cost = sad_cost;
265
15.0M
            }
266
29.4M
        }
267
151M
        else
268
151M
        {
269
151M
            if(sad_cost < best_asad_cost)
270
20.5M
            {
271
20.5M
                best_amode = mode;
272
20.5M
                best_asad_cost = sad_cost;
273
20.5M
            }
274
151M
        }
275
181M
    }
276
277
16.8M
    pu1_best_modes[0] = best_amode;
278
16.8M
    pu1_best_sad_costs[0] = best_asad_cost;
279
280
16.8M
    if(1 == u1_low_resol)
281
14.7M
    {
282
14.7M
        pu1_best_modes[1] = best_nmode;
283
14.7M
        pu1_best_sad_costs[1] = best_nsad_cost;
284
14.7M
    }
285
16.8M
}
286
287
/*!
288
******************************************************************************
289
* \if Function name : ihevce_ed_calc_4x4_blk \endif
290
*
291
* \brief: evaluate input 4x4 block for all intra modes and return best sad &
292
*  cost
293
*
294
*****************************************************************************
295
*/
296
static void ihevce_ed_calc_4x4_blk(
297
    ihevce_ed_blk_t *ps_ed,
298
    UWORD8 *pu1_src,
299
    WORD32 src_stride,
300
    UWORD8 *ref,
301
    UWORD16 *mode_bits_cost,
302
    WORD32 *pi4_best_satd,
303
    WORD32 i4_quality_preset,
304
    WORD32 *pi4_best_sad_cost,
305
    ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list)
306
14.7M
{
307
14.7M
    WORD32 i, i_end;
308
14.7M
    UWORD8 mode, best_amode, best_nmode;
309
14.7M
    UWORD8 pred[16];
310
14.7M
    UWORD16 sad;
311
14.7M
    WORD32 sad_cost = 0;
312
14.7M
    WORD32 best_asad_cost = 0xFFFFF;
313
14.7M
    WORD32 best_nsad_cost = 0xFFFFF;
314
14.7M
    UWORD8 au1_best_modes[2];
315
14.7M
    WORD32 ai4_best_sad_costs[2];
316
    /* L1/L2 resolution hence low resolution enable */
317
14.7M
    const WORD32 u1_low_resol = 1;
318
14.7M
    UWORD8 modes_to_eval[2];
319
320
14.7M
    ps_ipe_optimised_function_list->pf_ed_4x4_find_best_modes(
321
14.7M
        pu1_src,
322
14.7M
        src_stride,
323
14.7M
        ref,
324
14.7M
        mode_bits_cost,
325
14.7M
        au1_best_modes,
326
14.7M
        ai4_best_sad_costs,
327
14.7M
        u1_low_resol,
328
14.7M
        ps_ipe_optimised_function_list->pf_4x4_sad_computer);
329
330
14.7M
    best_nmode = au1_best_modes[1];
331
14.7M
    best_amode = au1_best_modes[0];
332
14.7M
    best_nsad_cost = ai4_best_sad_costs[1];
333
14.7M
    best_asad_cost = ai4_best_sad_costs[0];
334
14.7M
    *pi4_best_satd = best_asad_cost - mode_bits_cost[best_amode];
335
336
    /* Around best level 4 angular mode, search for best level 2 mode */
337
14.7M
    modes_to_eval[0] = best_amode - 2;
338
14.7M
    modes_to_eval[1] = best_amode + 2;
339
14.7M
    i = 0;
340
14.7M
    i_end = 2;
341
14.7M
    if(best_amode == 2)
342
119k
        i = 1;
343
14.5M
    else if(best_amode == 34)
344
39.8k
        i_end = 1;
345
43.9M
    for(; i < i_end; i++)
346
29.2M
    {
347
29.2M
        mode = modes_to_eval[i];
348
29.2M
        g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
349
29.2M
        sad = ps_ipe_optimised_function_list->pf_4x4_sad_computer(pu1_src, pred, src_stride, 4);
350
29.2M
        sad_cost = sad + mode_bits_cost[mode];
351
29.2M
        if(sad_cost < best_asad_cost)
352
225k
        {
353
225k
            best_amode = mode;
354
225k
            best_asad_cost = sad_cost;
355
225k
            *pi4_best_satd = sad;
356
225k
        }
357
29.2M
    }
358
359
14.7M
    if(i4_quality_preset < IHEVCE_QUALITY_P4)
360
9.20M
    {
361
        /* Around best level 2 angular mode, search for best level 1 mode */
362
9.20M
        modes_to_eval[0] = best_amode - 1;
363
9.20M
        modes_to_eval[1] = best_amode + 1;
364
9.20M
        i = 0;
365
9.20M
        i_end = 2;
366
9.20M
        if(best_amode == 2)
367
73.3k
            i = 1;
368
9.13M
        else if(best_amode == 34)
369
21.6k
            i_end = 1;
370
27.5M
        for(; i < i_end; i++)
371
18.3M
        {
372
18.3M
            mode = modes_to_eval[i];
373
18.3M
            g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
374
18.3M
            sad = ps_ipe_optimised_function_list->pf_4x4_sad_computer(pu1_src, pred, src_stride, 4);
375
18.3M
            sad_cost = sad + mode_bits_cost[mode];
376
18.3M
            if(sad_cost < best_asad_cost)
377
167k
            {
378
167k
                best_amode = mode;
379
167k
                best_asad_cost = sad_cost;
380
167k
                *pi4_best_satd = sad;
381
167k
            }
382
18.3M
        }
383
9.20M
    }
384
385
14.7M
    if(best_asad_cost < best_nsad_cost)
386
1.03M
    {
387
1.03M
        ps_ed->best_mode = best_amode;
388
1.03M
        *pi4_best_sad_cost = best_asad_cost;
389
1.03M
    }
390
13.6M
    else
391
13.6M
    {
392
13.6M
        ps_ed->best_mode = best_nmode;
393
13.6M
        *pi4_best_sad_cost = best_nsad_cost;
394
13.6M
    }
395
14.7M
    ps_ed->intra_or_inter = 0;
396
14.7M
    ps_ed->merge_success = 0;
397
14.7M
}
398
399
/*!
400
******************************************************************************
401
* \if Function name : ihevce_ed_calc_8x8_blk \endif
402
*
403
* \brief: evaluate input 8x8 block for intra modes basing on the intra mode
404
*  decisions made at 4x4 level. This function also makes a decision whether
405
*  to split blk in to 4x4 partitions or not.
406
*
407
*****************************************************************************
408
*/
409
static void ihevce_ed_calc_8x8_blk(
410
    ihevce_ed_ctxt_t *ps_ed_ctxt,
411
    ihevce_ed_blk_t *ps_ed_8x8,
412
    UWORD8 *pu1_src,
413
    WORD32 src_stride,
414
    WORD32 *nbr_flags_ptr,
415
    WORD32 lambda,
416
    WORD32 *pi4_best_satd,
417
    WORD32 i4_layer_id,
418
    WORD32 i4_quality_preset,
419
    WORD32 *pi4_best_sad_cost_8x8_l1_ipe,
420
    WORD32 *pi4_best_sad_8x8_l1_ipe,
421
    ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
422
    ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
423
3.67M
{
424
3.67M
    ihevce_ed_blk_t *ps_ed_4x4 = ps_ed_8x8;
425
3.67M
    UWORD8 *pu1_src_arr[4];
426
3.67M
    WORD32 ai4_4x4_best_sad_cost[4];
427
3.67M
    WORD32 nbr_flags_c, nbr_flags_r;
428
3.67M
    UWORD8 *pu1_src_4x4;
429
3.67M
    WORD32 i, j;
430
3.67M
    func_selector_t *ps_func_selector = ps_ed_ctxt->ps_func_selector;
431
3.67M
    ihevc_intra_pred_luma_ref_substitution_ft *pf_intra_pred_luma_ref_substitution =
432
3.67M
        ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;
433
434
    /* linearize ref samples for ipe of 8x8 block */
435
3.67M
    nbr_flags_c = nbr_flags_ptr[0];
436
3.67M
    nbr_flags_r = nbr_flags_ptr[1];
437
3.67M
    if(CHECK_TR_AVAILABLE(nbr_flags_r))
438
1.84M
    {
439
1.84M
        SET_TR_AVAILABLE(nbr_flags_c);
440
1.84M
    }
441
1.82M
    else
442
1.82M
    {
443
1.82M
        SET_TR_UNAVAILABLE(nbr_flags_c);
444
1.82M
    }
445
446
3.67M
    pf_intra_pred_luma_ref_substitution(
447
3.67M
        pu1_src - src_stride - 1,
448
3.67M
        pu1_src - src_stride,
449
3.67M
        pu1_src - 1,
450
3.67M
        src_stride,
451
3.67M
        8,
452
3.67M
        nbr_flags_c,
453
3.67M
        &ps_ed_ctxt->au1_ref_8x8[0][0],
454
3.67M
        0);
455
456
11.0M
    for(i = 0; i < 2; i++)
457
7.35M
    {
458
7.35M
        pu1_src_4x4 = pu1_src + i * 4 * src_stride;
459
22.0M
        for(j = 0; j < 2; j++)
460
14.7M
        {
461
14.7M
            WORD32 i4_best_satd;
462
463
14.7M
            pu1_src_arr[i * 2 + j] = pu1_src_4x4;
464
14.7M
            nbr_flags_c = nbr_flags_ptr[i * 8 + j];
465
466
            /* linearize ref samples for ipe of 4x4 block */
467
14.7M
            pf_intra_pred_luma_ref_substitution(
468
14.7M
                pu1_src_4x4 - src_stride - 1,
469
14.7M
                pu1_src_4x4 - src_stride,
470
14.7M
                pu1_src_4x4 - 1,
471
14.7M
                src_stride,
472
14.7M
                4,
473
14.7M
                nbr_flags_c,
474
14.7M
                &ps_ed_ctxt->au1_ref_full_ctb[i * 2 + j][0],
475
14.7M
                0);
476
477
            /* populates mode bits cost */
478
14.7M
            ihevce_intra_populate_mode_bits_cost(
479
14.7M
                &ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i * 2 + j][0], lambda);
480
481
14.7M
            ihevce_ed_calc_4x4_blk(
482
14.7M
                ps_ed_4x4,
483
14.7M
                pu1_src_4x4,
484
14.7M
                src_stride,
485
14.7M
                &ps_ed_ctxt->au1_ref_full_ctb[i * 2 + j][0],
486
14.7M
                &ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i * 2 + j][0],
487
14.7M
                &i4_best_satd,
488
14.7M
                i4_quality_preset,
489
14.7M
                &ai4_4x4_best_sad_cost[i * 2 + j],
490
14.7M
                ps_ipe_optimised_function_list);
491
492
14.7M
            pu1_src_4x4 += 4;
493
14.7M
            ps_ed_4x4 += 1;
494
14.7M
        }
495
7.35M
    }
496
497
    /* 8x8 merge */
498
3.67M
    {
499
3.67M
        UWORD8 pred[64];
500
3.67M
        WORD32 merge_success;
501
3.67M
        WORD32 sad, satd, cost;
502
3.67M
        UWORD16 u2_sum_best_4x4_sad_cost = 0;
503
3.67M
        UWORD16 u2_sum_best_4x4_satd_cost = 0;
504
3.67M
        WORD32 i4_best_8x8_sad, i4_best_8x8_satd = 0;
505
3.67M
        UWORD16 u2_best_8x8_cost = (UWORD16)(-1);
506
3.67M
        UWORD8 u1_best_8x8_mode;
507
3.67M
        UWORD8 modes_to_eval[6];
508
3.67M
        UWORD8 u1_cond_4x4_satd;
509
3.67M
        UWORD8 mode;
510
511
        /* init */
512
3.67M
        ps_ed_4x4 = ps_ed_8x8;
513
3.67M
        u1_best_8x8_mode = mode = ps_ed_4x4[0].best_mode;
514
3.67M
        merge_success =
515
3.67M
            (((ps_ed_4x4[0].best_mode == ps_ed_4x4[1].best_mode) +
516
3.67M
              (ps_ed_4x4[0].best_mode == ps_ed_4x4[2].best_mode) +
517
3.67M
              (ps_ed_4x4[0].best_mode == ps_ed_4x4[3].best_mode)) == 3);
518
3.67M
        *pi4_best_satd = 0;
519
520
18.3M
        for(i = 0; i < 4; i++)
521
14.7M
        {
522
14.7M
            u2_sum_best_4x4_sad_cost += ai4_4x4_best_sad_cost[i];
523
14.7M
            modes_to_eval[i] = ps_ed_4x4[i].best_mode;
524
14.7M
        }
525
526
3.67M
        u1_cond_4x4_satd = ((1 == i4_layer_id) || (!merge_success && i4_quality_preset < IHEVCE_QUALITY_P4));
527
3.67M
        if(u1_cond_4x4_satd)
528
3.05M
        {
529
            /* Get SATD for 4x4 blocks */
530
15.2M
            for(i = 0; i < 4; i++)
531
12.2M
            {
532
12.2M
                mode = modes_to_eval[i];
533
12.2M
                g_apf_lum_ip[g_i4_ip_funcs[mode]](
534
12.2M
                    &ps_ed_ctxt->au1_ref_full_ctb[i][0], 0, &pred[0], 4, 4, mode);
535
536
12.2M
                satd = ps_cmn_utils_optimised_function_list->pf_HAD_4x4_8bit(
537
12.2M
                    pu1_src_arr[i], src_stride, &pred[0], 4, NULL, 0);
538
539
12.2M
                (ps_ed_4x4 + i)->i4_4x4_satd = satd;
540
541
12.2M
                u2_sum_best_4x4_satd_cost +=
542
12.2M
                    (satd + ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i][mode]);
543
12.2M
                *pi4_best_satd += satd;
544
12.2M
            }
545
3.05M
        }
546
547
3.67M
        if(!merge_success)
548
395k
        {
549
395k
            UWORD8 i1_start; /* no of modes to evaluate */
550
395k
            UWORD8 ai1_modes[6];
551
395k
            WORD32 i4_merge_success_stage2 = 0;
552
553
            /* Prepare 6 candidates for 8x8 block. Two are DC and planar */
554
395k
            ai1_modes[4] = 0;
555
395k
            ai1_modes[5] = 1;
556
395k
            i1_start = 4;
557
558
            /* Assign along with removing duplicates rest 4 candidates. */
559
1.97M
            for(i = 3; i >= 0; i--)
560
1.58M
            {
561
1.58M
                WORD8 i1_fresh_mode_flag = 1;
562
563
1.58M
                mode = modes_to_eval[i];
564
                /* Check if duplicate already exists in ai1_modes */
565
5.80M
                for(j = i1_start; j < 6; j++)
566
4.22M
                {
567
4.22M
                    if(mode == ai1_modes[j])
568
882k
                        i1_fresh_mode_flag = 0;
569
4.22M
                }
570
1.58M
                if(i1_fresh_mode_flag)
571
697k
                {
572
697k
                    i1_start--;
573
697k
                    ai1_modes[i1_start] = mode;
574
697k
                }
575
1.58M
            }
576
577
395k
            if(i4_quality_preset < IHEVCE_QUALITY_P4)
578
266k
            {
579
                // 7.5 * lambda to incorporate transform flags
580
266k
                u2_sum_best_4x4_satd_cost +=
581
266k
                    (COMPUTE_RATE_COST_CLIP30(12, lambda, (LAMBDA_Q_SHIFT + 1)));
582
583
                /* loop over all modes for calculating SATD */
584
1.27M
                for(i = i1_start; i < 6; i++)
585
1.00M
                {
586
1.00M
                    mode = ai1_modes[i];
587
1.00M
                    g_apf_lum_ip[g_i4_ip_funcs[mode]](
588
1.00M
                        &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred[0], 8, 8, mode);
589
590
1.00M
                    satd = ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
591
1.00M
                        pu1_src_arr[0], src_stride, &pred[0], 8, NULL, 0);
592
593
1.00M
                    cost = satd + ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][mode];
594
595
                    /* Update data corresponding to least 8x8 cost */
596
1.00M
                    if(cost <= u2_best_8x8_cost)
597
489k
                    {
598
489k
                        u2_best_8x8_cost = cost;
599
489k
                        i4_best_8x8_satd = satd;
600
489k
                        u1_best_8x8_mode = mode;
601
489k
                    }
602
1.00M
                }
603
604
                /* 8x8 vs 4x4 decision based on SATD values */
605
266k
                if((u2_best_8x8_cost <= u2_sum_best_4x4_satd_cost) || (u2_best_8x8_cost <= 300))
606
142k
                {
607
142k
                    i4_merge_success_stage2 = 1;
608
142k
                }
609
610
                /* Find the SAD based cost for 8x8 block for best mode */
611
266k
                if(1 == i4_layer_id)
612
185k
                {
613
185k
                    UWORD8 i4_best_8x8_mode = u1_best_8x8_mode;
614
185k
                    WORD32 i4_best_8x8_sad_curr;
615
616
185k
                    g_apf_lum_ip[g_i4_ip_funcs[i4_best_8x8_mode]](
617
185k
                        &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred[0], 8, 8, i4_best_8x8_mode);
618
619
185k
                    i4_best_8x8_sad_curr = ps_ipe_optimised_function_list->pf_8x8_sad_computer(
620
185k
                        pu1_src_arr[0], &pred[0], src_stride, 8);
621
622
185k
                    *pi4_best_sad_cost_8x8_l1_ipe =
623
185k
                        i4_best_8x8_sad_curr +
624
185k
                        ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][i4_best_8x8_mode];
625
185k
                    *pi4_best_sad_8x8_l1_ipe = i4_best_8x8_sad_curr;
626
185k
                }
627
266k
            }
628
128k
            else /*If high_speed or extreme speed*/
629
128k
            {
630
                // 7.5 * lambda to incorporate transform flags
631
128k
                u2_sum_best_4x4_sad_cost +=
632
128k
                    (COMPUTE_RATE_COST_CLIP30(12, lambda, (LAMBDA_Q_SHIFT + 1)));
633
634
                /*Loop over all modes for calculating SAD*/
635
608k
                for(i = i1_start; i < 6; i++)
636
479k
                {
637
479k
                    mode = ai1_modes[i];
638
479k
                    g_apf_lum_ip[g_i4_ip_funcs[mode]](
639
479k
                        &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred[0], 8, 8, mode);
640
641
479k
                    sad = ps_ipe_optimised_function_list->pf_8x8_sad_computer(
642
479k
                        pu1_src_arr[0], &pred[0], src_stride, 8);
643
644
479k
                    cost = sad + ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][mode];
645
646
                    /*Find the data correspoinding to least cost */
647
479k
                    if(cost <= u2_best_8x8_cost)
648
229k
                    {
649
229k
                        u2_best_8x8_cost = cost;
650
229k
                        i4_best_8x8_sad = sad;
651
229k
                        u1_best_8x8_mode = mode;
652
229k
                    }
653
479k
                }
654
655
                /* 8x8 vs 4x4 decision based on SAD values */
656
128k
                if((u2_best_8x8_cost <= u2_sum_best_4x4_sad_cost) || (u2_best_8x8_cost <= 300))
657
67.3k
                {
658
67.3k
                    i4_merge_success_stage2 = 1;
659
67.3k
                    if(1 == i4_layer_id)
660
49.6k
                    {
661
49.6k
                        g_apf_lum_ip[g_i4_ip_funcs[u1_best_8x8_mode]](
662
49.6k
                            &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred[0], 8, 8, u1_best_8x8_mode);
663
49.6k
                        i4_best_8x8_satd = ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
664
49.6k
                            pu1_src_arr[0], src_stride, &pred[0], 8, NULL, 0);
665
49.6k
                    }
666
67.3k
                }
667
668
128k
                if(1 == i4_layer_id)
669
98.0k
                {
670
98.0k
                    *pi4_best_sad_cost_8x8_l1_ipe = u2_best_8x8_cost;
671
98.0k
                    *pi4_best_sad_8x8_l1_ipe = i4_best_8x8_sad;
672
98.0k
                }
673
128k
            }
674
395k
            if(i4_merge_success_stage2)
675
209k
            {
676
209k
                ps_ed_4x4->merge_success = 1;
677
209k
                ps_ed_4x4->best_merge_mode = u1_best_8x8_mode;
678
209k
                *pi4_best_satd = i4_best_8x8_satd;
679
209k
            }
680
395k
        }
681
3.28M
        else
682
3.28M
        {
683
3.28M
            ps_ed_4x4->merge_success = 1;
684
3.28M
            ps_ed_4x4->best_merge_mode = u1_best_8x8_mode;
685
686
3.28M
            if(1 == i4_layer_id)
687
2.69M
            {
688
2.69M
                mode = u1_best_8x8_mode;
689
2.69M
                g_apf_lum_ip[g_i4_ip_funcs[mode]](
690
2.69M
                    &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred[0], 8, 8, mode);
691
692
2.69M
                i4_best_8x8_sad = ps_ipe_optimised_function_list->pf_8x8_sad_computer(
693
2.69M
                    pu1_src_arr[0], &pred[0], src_stride, 8);
694
695
2.69M
                *pi4_best_sad_cost_8x8_l1_ipe =
696
2.69M
                    i4_best_8x8_sad + ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][mode];
697
2.69M
                *pi4_best_sad_8x8_l1_ipe = i4_best_8x8_sad;
698
699
2.69M
                i4_best_8x8_satd = ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
700
2.69M
                    pu1_src_arr[0], src_stride, &pred[0], 8, NULL, 0);
701
2.69M
            }
702
3.28M
            *pi4_best_satd = i4_best_8x8_satd;
703
3.28M
        }
704
3.67M
    }
705
3.67M
}
706
707
/*!
708
******************************************************************************
709
* \if Function name : ihevce_ed_calc_ctb \endif
710
*
711
* \brief: performs L1/L2 8x8 and 4x4 intra mode analysis
712
*
713
*****************************************************************************
714
*/
715
void ihevce_ed_calc_ctb(
716
    ihevce_ed_ctxt_t *ps_ed_ctxt,
717
    ihevce_ed_blk_t *ps_ed_ctb,
718
    ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
719
    UWORD8 *pu1_src,
720
    WORD32 src_stride,
721
    WORD32 num_4x4_blks_x,
722
    WORD32 num_4x4_blks_y,
723
    WORD32 *nbr_flags,
724
    WORD32 i4_layer_id,
725
    ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
726
    ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
727
368k
{
728
368k
    ihevce_ed_blk_t *ps_ed_8x8;
729
368k
    UWORD8 *pu1_src_8x8;
730
368k
    WORD32 *nbr_flags_ptr;
731
368k
    WORD32 lambda = ps_ed_ctxt->lambda;
732
368k
    WORD32 i, j;
733
368k
    WORD32 z_scan_idx = 0;
734
368k
    WORD32 z_scan_act_idx = 0;
735
736
368k
    if(i4_layer_id == 1)
737
193k
    {
738
193k
        WORD32 i4_i;
739
740
12.5M
        for(i4_i = 0; i4_i < 64; i4_i++)
741
12.3M
        {
742
12.3M
            (ps_ed_ctb + i4_i)->i4_4x4_satd = -1;
743
12.3M
        }
744
745
3.29M
        for(i4_i = 0; i4_i < 16; i4_i++)
746
3.09M
        {
747
3.09M
            ps_ed_ctb_l1->i4_sum_4x4_satd[i4_i] = -2;
748
3.09M
            ps_ed_ctb_l1->i4_min_4x4_satd[i4_i] = 0x7FFFFFFF;
749
3.09M
            ps_ed_ctb_l1->i4_8x8_satd[i4_i][0] = -2;
750
3.09M
            ps_ed_ctb_l1->i4_8x8_satd[i4_i][1] = -2;
751
3.09M
        }
752
753
968k
        for(i4_i = 0; i4_i < 4; i4_i++)
754
774k
        {
755
774k
            ps_ed_ctb_l1->i4_16x16_satd[i4_i][0] = -2;
756
774k
            ps_ed_ctb_l1->i4_16x16_satd[i4_i][1] = -2;
757
774k
            ps_ed_ctb_l1->i4_16x16_satd[i4_i][2] = -2;
758
774k
        }
759
193k
        ps_ed_ctb_l1->i4_32x32_satd[0][0] = -2;
760
193k
        ps_ed_ctb_l1->i4_32x32_satd[0][1] = -2;
761
193k
        ps_ed_ctb_l1->i4_32x32_satd[0][2] = -2;
762
193k
        ps_ed_ctb_l1->i4_32x32_satd[0][3] = -2;
763
764
3.29M
        for(i4_i = 0; i4_i < 16; i4_i++)
765
3.09M
        {
766
3.09M
            ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_me[i4_i] = -1;
767
3.09M
            ps_ed_ctb_l1->i4_sad_cost_me_for_ref[i4_i] = -1;
768
3.09M
            ps_ed_ctb_l1->i4_sad_me_for_ref[i4_i] = -1;
769
3.09M
            ps_ed_ctb_l1->i4_best_sad_8x8_l1_me[i4_i] = -1;
770
771
3.09M
            ps_ed_ctb_l1->i4_best_sad_8x8_l1_me_for_decide[i4_i] = -1;
772
773
3.09M
            ps_ed_ctb_l1->i4_best_satd_8x8[i4_i] = -1;
774
3.09M
            ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[i4_i] = -1;
775
3.09M
            ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[i4_i] = -1;
776
3.09M
        }
777
193k
    }
778
779
368k
    ASSERT((num_4x4_blks_x & 1) == 0);
780
368k
    ASSERT((num_4x4_blks_y & 1) == 0);
781
1.47M
    for(i = 0; i < num_4x4_blks_y / 2; i++)
782
1.11M
    {
783
1.11M
        pu1_src_8x8 = pu1_src + i * 2 * 4 * src_stride;
784
1.11M
        nbr_flags_ptr = &nbr_flags[0] + 2 * 8 * i;
785
786
4.78M
        for(j = 0; j < num_4x4_blks_x / 2; j++)
787
3.67M
        {
788
3.67M
            WORD32 i4_best_satd;
789
3.67M
            WORD32 i4_best_sad_cost_8x8_l1_ipe;
790
3.67M
            WORD32 i4_best_sad_8x8_l1_ipe;
791
792
3.67M
            z_scan_idx = gau1_ctb_raster_to_zscan[i * 2 * 16 + j * 2];
793
3.67M
            z_scan_act_idx = gau1_ctb_raster_to_zscan[i * 16 + j];
794
3.67M
            ASSERT(z_scan_act_idx <= 15);
795
796
3.67M
            ps_ed_8x8 = ps_ed_ctb + z_scan_idx;
797
3.67M
            ihevce_ed_calc_8x8_blk(
798
3.67M
                ps_ed_ctxt,
799
3.67M
                ps_ed_8x8,
800
3.67M
                pu1_src_8x8,
801
3.67M
                src_stride,
802
3.67M
                nbr_flags_ptr,
803
3.67M
                lambda,
804
3.67M
                &i4_best_satd,
805
3.67M
                i4_layer_id,
806
3.67M
                ps_ed_ctxt->i4_quality_preset,
807
3.67M
                &i4_best_sad_cost_8x8_l1_ipe,
808
3.67M
                &i4_best_sad_8x8_l1_ipe,
809
3.67M
                ps_ipe_optimised_function_list,
810
3.67M
                ps_cmn_utils_optimised_function_list);
811
3.67M
            ASSERT(i4_best_satd >= 0);
812
813
3.67M
            if(i4_layer_id == 1)
814
2.97M
            {
815
2.97M
                ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[z_scan_act_idx] =
816
2.97M
                    i4_best_sad_cost_8x8_l1_ipe;
817
2.97M
                ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[z_scan_act_idx] = i4_best_sad_8x8_l1_ipe;
818
2.97M
                ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] = i4_best_satd;
819
2.97M
                ps_ed_ctxt->i8_sum_best_satd += i4_best_satd;
820
2.97M
                ps_ed_ctxt->i8_sum_sq_best_satd += (i4_best_satd * i4_best_satd);
821
2.97M
            }
822
3.67M
            pu1_src_8x8 += 8;
823
3.67M
            nbr_flags_ptr += 2;
824
3.67M
        }
825
1.11M
    }
826
368k
}
827
828
/*!
******************************************************************************
* \if Function name : fast_log2 \endif
*
* \brief: Cheap approximation of log2(val) for a single precision float.
*         The biased exponent field is read straight from the bit pattern and
*         the mantissa, rescaled to [1, 2), is run through a quadratic that
*         equals 1 at 1.0 and 2 at 2.0. No special handling is done for zero,
*         negative, infinite or NaN inputs.
*
*****************************************************************************
*/
float fast_log2(float val)
{
    /* Bit-level work is done on an unsigned view of the float so that no */
    /* shift or mask ever operates on a negative signed value             */
    union { float val; uint32_t x; } u = { val };
    /* 8 bit biased exponent field, offset by 128 (one more than the bias; */
    /* the polynomial below supplies the remaining +1)                    */
    float log_2 = (float)((int32_t)((u.x >> 23) & 255u) - 128);

    /* Replace the exponent field by 127 so that u.val holds the mantissa */
    u.x &= ~(255u << 23);
    u.x += 127u << 23;
    log_2 += ((-1.0f / 3) * u.val + 2) * u.val - 2.0f / 3;
    return log_2;
}
838
839
/*!
840
******************************************************************************
841
* \if Function name : ihevce_cu_level_qp_mod \endif
842
*
843
* \brief: Performs CU level QP modulation
844
*
845
*****************************************************************************
846
*/
847
WORD32 ihevce_cu_level_qp_mod(
848
    WORD32 frm_qscale,
849
    WORD32 cu_satd,
850
    long double frm_avg_activity,
851
    float f_mod_strength,
852
    WORD32 *pi4_act_factor,
853
    WORD32 *pi4_q_scale_mod,
854
    rc_quant_t *rc_quant_ctxt)
855
21.8M
{
856
21.8M
    WORD32 cu_qscale;
857
21.8M
    WORD32 cu_qp;
858
859
21.8M
    *pi4_act_factor = (1 << QP_LEVEL_MOD_ACT_FACTOR);
860
21.8M
    if(cu_satd != -1 && (WORD32)frm_avg_activity != 0)
861
12.5M
    {
862
12.5M
        ULWORD64 sq_cur_satd = ((ULWORD64)cu_satd * (ULWORD64)cu_satd);
863
12.5M
        float log2_sq_cur_satd = fast_log2(1 + sq_cur_satd);
864
12.5M
        WORD32 qp_offset = f_mod_strength * (log2_sq_cur_satd - frm_avg_activity);
865
866
12.5M
        ASSERT(USE_SQRT_AVG_OF_SATD_SQR);
867
12.5M
        qp_offset = CLIP3(qp_offset, MIN_QP_MOD_OFFSET, MAX_QP_MOD_OFFSET);
868
12.5M
        *pi4_act_factor *= gad_look_up_activity[qp_offset + ABS(MIN_QP_MOD_OFFSET)];
869
12.5M
        ASSERT(*pi4_act_factor > 0);
870
12.5M
        cu_qscale = ((frm_qscale * (*pi4_act_factor)) + (1 << (QP_LEVEL_MOD_ACT_FACTOR - 1)));
871
12.5M
        cu_qscale >>= QP_LEVEL_MOD_ACT_FACTOR;
872
12.5M
    }
873
9.30M
    else
874
9.30M
    {
875
9.30M
        cu_qscale = frm_qscale;
876
9.30M
    }
877
21.8M
    cu_qscale = CLIP3(cu_qscale, rc_quant_ctxt->i2_min_qscale, rc_quant_ctxt->i2_max_qscale);
878
21.8M
    cu_qp = rc_quant_ctxt->pi4_qscale_to_qp[cu_qscale];
879
21.8M
    cu_qp = CLIP3(cu_qp, rc_quant_ctxt->i2_min_qp, rc_quant_ctxt->i2_max_qp);
880
21.8M
    *pi4_q_scale_mod = cu_qscale;
881
882
21.8M
    return (cu_qp);
883
21.8M
}
884
885
/*!
886
******************************************************************************
887
* \if Function name : ihevce_ed_frame_init \endif
888
*
889
* \brief: Initialize frame context for early decision
890
*
891
*****************************************************************************
892
*/
893
void ihevce_ed_frame_init(void *pv_ed_ctxt, WORD32 i4_layer_no)
894
287k
{
895
287k
    ihevce_ed_ctxt_t *ps_ed_ctxt = (ihevce_ed_ctxt_t *)pv_ed_ctxt;
896
897
287k
    g_apf_lum_ip[IP_FUNC_MODE_0] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_planar_fptr;
898
287k
    g_apf_lum_ip[IP_FUNC_MODE_1] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_dc_fptr;
899
287k
    g_apf_lum_ip[IP_FUNC_MODE_2] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode2_fptr;
900
287k
    g_apf_lum_ip[IP_FUNC_MODE_3TO9] =
901
287k
        ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_3_to_9_fptr;
902
287k
    g_apf_lum_ip[IP_FUNC_MODE_10] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_horz_fptr;
903
287k
    g_apf_lum_ip[IP_FUNC_MODE_11TO17] =
904
287k
        ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_11_to_17_fptr;
905
287k
    g_apf_lum_ip[IP_FUNC_MODE_18_34] =
906
287k
        ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_18_34_fptr;
907
287k
    g_apf_lum_ip[IP_FUNC_MODE_19TO25] =
908
287k
        ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_19_to_25_fptr;
909
287k
    g_apf_lum_ip[IP_FUNC_MODE_26] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_ver_fptr;
910
287k
    g_apf_lum_ip[IP_FUNC_MODE_27TO33] =
911
287k
        ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_27_to_33_fptr;
912
913
287k
    if(i4_layer_no == 1)
914
95.3k
    {
915
95.3k
        ps_ed_ctxt->i8_sum_best_satd = 0;
916
95.3k
        ps_ed_ctxt->i8_sum_sq_best_satd = 0;
917
95.3k
    }
918
287k
}
919
920
/**
921
********************************************************************************
922
*
923
*  @brief  downscales by 2 in horz and vertical direction, creates output of
924
*          size wd/2 * ht/2
925
*
926
*  @param[in]  pu1_src : source pointer
927
*  @param[in]  src_stride : source stride
928
*  @param[out] pu1_dst : destination pointer. Starting of a row.
929
*  @param[in]  dst_stride : destination stride
930
*  @param[in]  wd : width
931
*  @param[in]  ht : height
932
*  @param[in]  pu1_wkg_mem : working memory (at least of size CEIL16(wd) * ht)
933
*  @param[in]  ht_offset : height offset of the block to be scaled
934
*  @param[in]  block_ht : height of the block to be scaled
935
*  @param[in]  wd_offset : width offset of the block to be scaled
936
*  @param[in]  block_wd : width of the block to be scaled
937
*
938
*  @return void
939
*
940
*  @remarks Assumes block_ht is a multiple of 2. LANCZOS_SCALER
941
*
942
********************************************************************************
943
*/
944
void ihevce_scaling_filter_mxn(
945
    UWORD8 *pu1_src,
946
    WORD32 src_strd,
947
    UWORD8 *pu1_scrtch,
948
    WORD32 scrtch_strd,
949
    UWORD8 *pu1_dst,
950
    WORD32 dst_strd,
951
    WORD32 ht,
952
    WORD32 wd)
953
464k
{
954
1.53G
#define FILT_TAP_Q 8
955
464k
#define N_TAPS 7
956
464k
    const WORD16 i4_ftaps[N_TAPS] = { -18, 0, 80, 132, 80, 0, -18 };
957
464k
    WORD32 i, j;
958
464k
    WORD32 tmp;
959
464k
    UWORD8 *pu1_src_tmp = pu1_src - 3 * src_strd;
960
464k
    UWORD8 *pu1_scrtch_tmp = pu1_scrtch;
961
962
    /* horizontal filtering */
963
22.2M
    for(i = -3; i < ht + 2; i++)
964
21.7M
    {
965
550M
        for(j = 0; j < wd; j += 2)
966
528M
        {
967
528M
            tmp = (i4_ftaps[3] * pu1_src_tmp[j] +
968
528M
                   i4_ftaps[2] * (pu1_src_tmp[j - 1] + pu1_src_tmp[j + 1]) +
969
528M
                   i4_ftaps[1] * (pu1_src_tmp[j + 2] + pu1_src_tmp[j - 2]) +
970
528M
                   i4_ftaps[0] * (pu1_src_tmp[j + 3] + pu1_src_tmp[j - 3]) +
971
528M
                   (1 << (FILT_TAP_Q - 1))) >>
972
528M
                  FILT_TAP_Q;
973
528M
            pu1_scrtch_tmp[j >> 1] = CLIP_U8(tmp);
974
528M
        }
975
21.7M
        pu1_scrtch_tmp += scrtch_strd;
976
21.7M
        pu1_src_tmp += src_strd;
977
21.7M
    }
978
    /* vertical filtering */
979
464k
    pu1_scrtch_tmp = pu1_scrtch + 3 * scrtch_strd;
980
10.1M
    for(i = 0; i < ht; i += 2)
981
9.72M
    {
982
249M
        for(j = 0; j < (wd >> 1); j++)
983
239M
        {
984
239M
            tmp =
985
239M
                (i4_ftaps[3] * pu1_scrtch_tmp[j] +
986
239M
                 i4_ftaps[2] * (pu1_scrtch_tmp[j + scrtch_strd] + pu1_scrtch_tmp[j - scrtch_strd]) +
987
239M
                 i4_ftaps[1] *
988
239M
                     (pu1_scrtch_tmp[j + 2 * scrtch_strd] + pu1_scrtch_tmp[j - 2 * scrtch_strd]) +
989
239M
                 i4_ftaps[0] *
990
239M
                     (pu1_scrtch_tmp[j + 3 * scrtch_strd] + pu1_scrtch_tmp[j - 3 * scrtch_strd]) +
991
239M
                 (1 << (FILT_TAP_Q - 1))) >>
992
239M
                FILT_TAP_Q;
993
239M
            pu1_dst[j] = CLIP_U8(tmp);
994
239M
        }
995
9.72M
        pu1_dst += dst_strd;
996
9.72M
        pu1_scrtch_tmp += (scrtch_strd << 1);
997
9.72M
    }
998
464k
}
999
1000
void ihevce_scale_by_2(
1001
    UWORD8 *pu1_src,
1002
    WORD32 src_strd,
1003
    UWORD8 *pu1_dst,
1004
    WORD32 dst_strd,
1005
    WORD32 wd,
1006
    WORD32 ht,
1007
    UWORD8 *pu1_wkg_mem,
1008
    WORD32 ht_offset,
1009
    WORD32 block_ht,
1010
    WORD32 wd_offset,
1011
    WORD32 block_wd,
1012
    FT_COPY_2D *pf_copy_2d,
1013
    FT_SCALING_FILTER_BY_2 *pf_scaling_filter_mxn)
1014
464k
{
1015
3.25M
#define N_TAPS 7
1016
464k
#define MAX_BLK_SZ (MAX_CTB_SIZE + ((N_TAPS >> 1) << 1))
1017
464k
    UWORD8 au1_cpy[MAX_BLK_SZ * MAX_BLK_SZ];
1018
464k
    UWORD32 cpy_strd = MAX_BLK_SZ;
1019
464k
    UWORD8 *pu1_cpy = au1_cpy + cpy_strd * (N_TAPS >> 1) + (N_TAPS >> 1);
1020
1021
464k
    UWORD8 *pu1_in, *pu1_out;
1022
464k
    WORD32 in_strd, wkg_mem_strd;
1023
1024
464k
    WORD32 row_start, row_end;
1025
464k
    WORD32 col_start, col_end;
1026
464k
    WORD32 i, fun_select;
1027
464k
    WORD32 ht_tmp, wd_tmp;
1028
464k
    FT_SCALING_FILTER_BY_2 *ihevce_scaling_filters[2];
1029
1030
464k
    assert((wd & 1) == 0);
1031
464k
    assert((ht & 1) == 0);
1032
464k
    assert(block_wd <= MAX_CTB_SIZE);
1033
464k
    assert(block_ht <= MAX_CTB_SIZE);
1034
1035
    /* function pointers for filtering different dimensions */
1036
464k
    ihevce_scaling_filters[0] = ihevce_scaling_filter_mxn;
1037
464k
    ihevce_scaling_filters[1] = pf_scaling_filter_mxn;
1038
1039
    /* handle boundary blks */
1040
464k
    col_start = (wd_offset < (N_TAPS >> 1)) ? 1 : 0;
1041
464k
    row_start = (ht_offset < (N_TAPS >> 1)) ? 1 : 0;
1042
464k
    col_end = ((wd_offset + block_wd) > (wd - (N_TAPS >> 1))) ? 1 : 0;
1043
464k
    row_end = ((ht_offset + block_ht) > (ht - (N_TAPS >> 1))) ? 1 : 0;
1044
464k
    if(col_end && (wd % block_wd != 0))
1045
18.8k
    {
1046
18.8k
        block_wd = (wd % block_wd);
1047
18.8k
    }
1048
464k
    if(row_end && (ht % block_ht != 0))
1049
19.0k
    {
1050
19.0k
        block_ht = (ht % block_ht);
1051
19.0k
    }
1052
1053
    /* boundary blks needs to be padded, copy src to tmp buffer */
1054
464k
    if(col_start || col_end || row_end || row_start)
1055
337k
    {
1056
337k
        UWORD8 *pu1_src_tmp = pu1_src + wd_offset + ht_offset * src_strd;
1057
1058
337k
        pu1_cpy -= (3 * (1 - col_start) + cpy_strd * 3 * (1 - row_start));
1059
337k
        pu1_src_tmp -= (3 * (1 - col_start) + src_strd * 3 * (1 - row_start));
1060
337k
        ht_tmp = block_ht + 3 * (1 - row_start) + 3 * (1 - row_end);
1061
337k
        wd_tmp = block_wd + 3 * (1 - col_start) + 3 * (1 - col_end);
1062
337k
        pf_copy_2d(pu1_cpy, cpy_strd, pu1_src_tmp, src_strd, wd_tmp, ht_tmp);
1063
337k
        pu1_in = au1_cpy + cpy_strd * 3 + 3;
1064
337k
        in_strd = cpy_strd;
1065
337k
    }
1066
127k
    else
1067
127k
    {
1068
127k
        pu1_in = pu1_src + wd_offset + ht_offset * src_strd;
1069
127k
        in_strd = src_strd;
1070
127k
    }
1071
1072
    /*top padding*/
1073
464k
    if(row_start)
1074
261k
    {
1075
261k
        UWORD8 *pu1_cpy_tmp = au1_cpy + cpy_strd * 3;
1076
1077
261k
        pu1_cpy = au1_cpy + cpy_strd * (3 - 1);
1078
261k
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
1079
261k
        pu1_cpy -= cpy_strd;
1080
261k
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
1081
261k
        pu1_cpy -= cpy_strd;
1082
261k
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
1083
261k
    }
1084
1085
    /*bottom padding*/
1086
464k
    if(row_end)
1087
261k
    {
1088
261k
        UWORD8 *pu1_cpy_tmp = au1_cpy + cpy_strd * 3 + (block_ht - 1) * cpy_strd;
1089
1090
261k
        pu1_cpy = pu1_cpy_tmp + cpy_strd;
1091
261k
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
1092
261k
        pu1_cpy += cpy_strd;
1093
261k
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
1094
261k
        pu1_cpy += cpy_strd;
1095
261k
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
1096
261k
    }
1097
1098
    /*left padding*/
1099
464k
    if(col_start)
1100
227k
    {
1101
227k
        UWORD8 *pu1_cpy_tmp = au1_cpy + 3;
1102
1103
227k
        pu1_cpy = au1_cpy;
1104
12.1M
        for(i = 0; i < block_ht + 6; i++)
1105
11.8M
        {
1106
11.8M
            pu1_cpy[0] = pu1_cpy[1] = pu1_cpy[2] = pu1_cpy_tmp[0];
1107
11.8M
            pu1_cpy += cpy_strd;
1108
11.8M
            pu1_cpy_tmp += cpy_strd;
1109
11.8M
        }
1110
227k
    }
1111
1112
    /*right padding*/
1113
464k
    if(col_end)
1114
227k
    {
1115
227k
        UWORD8 *pu1_cpy_tmp = au1_cpy + 3 + block_wd - 1;
1116
1117
227k
        pu1_cpy = au1_cpy + 3 + block_wd;
1118
12.1M
        for(i = 0; i < block_ht + 6; i++)
1119
11.8M
        {
1120
11.8M
            pu1_cpy[0] = pu1_cpy[1] = pu1_cpy[2] = pu1_cpy_tmp[0];
1121
11.8M
            pu1_cpy += cpy_strd;
1122
11.8M
            pu1_cpy_tmp += cpy_strd;
1123
11.8M
        }
1124
227k
    }
1125
1126
464k
    wkg_mem_strd = block_wd >> 1;
1127
464k
    pu1_out = pu1_dst + (wd_offset >> 1);
1128
464k
    fun_select = (block_wd % 16 == 0);
1129
464k
    ihevce_scaling_filters[fun_select](
1130
464k
        pu1_in, in_strd, pu1_wkg_mem, wkg_mem_strd, pu1_out, dst_strd, block_ht, block_wd);
1131
1132
    /* Left padding of 16 for 1st block of every row */
1133
464k
    if(wd_offset == 0)
1134
227k
    {
1135
227k
        UWORD8 u1_val;
1136
227k
        WORD32 pad_wd = 16;
1137
227k
        WORD32 pad_ht = block_ht >> 1;
1138
227k
        UWORD8 *dst = pu1_dst;
1139
1140
5.48M
        for(i = 0; i < pad_ht; i++)
1141
5.25M
        {
1142
5.25M
            u1_val = dst[0];
1143
5.25M
            memset(&dst[-pad_wd], u1_val, pad_wd);
1144
5.25M
            dst += dst_strd;
1145
5.25M
        }
1146
227k
    }
1147
1148
464k
    if(wd == wd_offset + block_wd)
1149
227k
    {
1150
        /* Right padding of (16 + (CEIL16(wd/2))-wd/2) for last block of every row */
1151
        /* Right padding is done only after processing of last block of that row is done*/
1152
227k
        UWORD8 u1_val;
1153
227k
        WORD32 pad_wd = 16 + CEIL16((wd >> 1)) - (wd >> 1) + 4;
1154
227k
        WORD32 pad_ht = block_ht >> 1;
1155
227k
        UWORD8 *dst = pu1_dst + (wd >> 1) - 1;
1156
1157
5.48M
        for(i = 0; i < pad_ht; i++)
1158
5.25M
        {
1159
5.25M
            u1_val = dst[0];
1160
5.25M
            memset(&dst[1], u1_val, pad_wd);
1161
5.25M
            dst += dst_strd;
1162
5.25M
        }
1163
1164
227k
        if(ht_offset == 0)
1165
192k
        {
1166
            /* Top padding of 16 is done for 1st row only after we reach end of that row */
1167
192k
            pad_wd = dst_strd;
1168
192k
            pad_ht = 16;
1169
192k
            dst = pu1_dst - 16;
1170
3.26M
            for(i = 1; i <= pad_ht; i++)
1171
3.07M
            {
1172
3.07M
                memcpy(dst - (i * dst_strd), dst, pad_wd);
1173
3.07M
            }
1174
192k
        }
1175
1176
        /* Bottom padding of (16 + (CEIL16(ht/2)) - ht/2) is done only if we have
1177
         reached end of frame */
1178
227k
        if(ht - ht_offset - block_ht == 0)
1179
192k
        {
1180
192k
            pad_wd = dst_strd;
1181
192k
            pad_ht = 16 + CEIL16((ht >> 1)) - (ht >> 1) + 4;
1182
192k
            dst = pu1_dst + (((block_ht >> 1) - 1) * dst_strd) - 16;
1183
4.09M
            for(i = 1; i <= pad_ht; i++)
1184
3.90M
                memcpy(dst + (i * dst_strd), dst, pad_wd);
1185
192k
        }
1186
227k
    }
1187
464k
}
1188
1189
/*!
1190
******************************************************************************
1191
* \if Function name : ihevce_decomp_pre_intra_process_row \endif
1192
*
1193
* \brief
1194
*  Row level function which down scales a given row by 2 in horz and vertical
1195
*  direction creates output of size wd/2 * ht/2. When decomposition is done
1196
*  from L1 to L2 pre intra analysis is done on L1
1197
*
1198
*****************************************************************************
1199
*/
1200
void ihevce_decomp_pre_intra_process_row(
1201
    UWORD8 *pu1_src,
1202
    WORD32 src_stride,
1203
    UWORD8 *pu1_dst_decomp,
1204
    WORD32 dst_stride,
1205
    WORD32 layer_wd,
1206
    WORD32 layer_ht,
1207
    UWORD8 *pu1_wkg_mem,
1208
    WORD32 ht_offset,
1209
    WORD32 block_ht,
1210
    WORD32 block_wd,
1211
    WORD32 num_col_blks,
1212
    WORD32 layer_no,
1213
    ihevce_ed_ctxt_t *ps_ed_ctxt,
1214
    ihevce_ed_blk_t *ps_ed_row,
1215
    ihevce_ed_ctb_l1_t *ps_ed_ctb_l1_row,
1216
    WORD32 num_4x4_blks_ctb_y,
1217
    WORD32 num_4x4_blks_last_ctb_x,
1218
    WORD32 skip_decomp,
1219
    WORD32 skip_pre_intra,
1220
    WORD32 row_block_no,
1221
    ctb_analyse_t *ps_ctb_analyse,
1222
    ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
1223
    ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
1224
532k
{
1225
532k
    WORD32 do_pre_intra_analysis = ((layer_no == 1) || (layer_no == 2)) && (!skip_pre_intra);
1226
532k
    WORD32 col_block_no;
1227
532k
    WORD32 i, j;
1228
1229
532k
    if(!skip_decomp)
1230
227k
    {
1231
227k
        ctb_analyse_t *ps_ctb_analyse_curr = ps_ctb_analyse + row_block_no * num_col_blks;
1232
1233
691k
        for(col_block_no = 0; col_block_no < num_col_blks; col_block_no++)
1234
464k
        {
1235
464k
            ihevce_scale_by_2(
1236
464k
                pu1_src,
1237
464k
                src_stride,
1238
464k
                pu1_dst_decomp,
1239
464k
                dst_stride,
1240
464k
                layer_wd,
1241
464k
                layer_ht,
1242
464k
                pu1_wkg_mem,
1243
464k
                ht_offset,
1244
464k
                block_ht,
1245
464k
                block_wd * col_block_no,
1246
464k
                block_wd,
1247
464k
                ps_cmn_utils_optimised_function_list->pf_copy_2d,
1248
464k
                ps_ipe_optimised_function_list->pf_scaling_filter_mxn);
1249
1250
            /* Disable noise detection */
1251
464k
            memset(
1252
464k
                ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy,
1253
464k
                0,
1254
464k
                sizeof(ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy));
1255
1256
464k
            ps_ctb_analyse_curr->s_ctb_noise_params.i4_noise_present = 0;
1257
1258
464k
            ps_ctb_analyse_curr++;
1259
464k
        }
1260
227k
    }
1261
1262
532k
    if(do_pre_intra_analysis)
1263
198k
    {
1264
198k
        ihevce_ed_blk_t *ps_ed_ctb = ps_ed_row;
1265
198k
        ihevce_ed_ctb_l1_t *ps_ed_ctb_l1 = ps_ed_ctb_l1_row;
1266
198k
        WORD32 *nbr_flags_ptr = &ps_ed_ctxt->ai4_nbr_flags[0];
1267
198k
        UWORD8 *pu1_src_pre_intra = pu1_src + (ht_offset * src_stride);
1268
198k
        WORD32 num_4x4_blks_in_ctb = block_wd >> 2;
1269
198k
        WORD32 src_inc_pre_intra = num_4x4_blks_in_ctb * 4;
1270
198k
        WORD32 inc_ctb = num_4x4_blks_in_ctb * num_4x4_blks_in_ctb;
1271
1272
        /* To analyse any given CTB we need to set the availability flags of the
1273
         * following neighbouring CTB: BL,L,TL,T,TR */
1274
        /* copy the neighbor flags for a general ctb (ctb inside the frame); not any corners */
1275
198k
        memcpy(
1276
198k
            ps_ed_ctxt->ai4_nbr_flags,
1277
198k
            gau4_nbr_flags_8x8_4x4blks,
1278
198k
            sizeof(gau4_nbr_flags_8x8_4x4blks));
1279
1280
        /* set top flags unavailable for first ctb row */
1281
198k
        if(ht_offset == 0)
1282
170k
        {
1283
1.22M
            for(j = 0; j < num_4x4_blks_in_ctb; j++)
1284
1.05M
            {
1285
1.05M
                SET_T_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j]);
1286
1.05M
                SET_TR_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j]);
1287
1.05M
                SET_TL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j]);
1288
1.05M
            }
1289
170k
        }
1290
1291
        /* set bottom left flags as not available for last row */
1292
198k
        if(ht_offset + block_ht >= layer_ht)
1293
170k
        {
1294
1.22M
            for(j = 0; j < num_4x4_blks_in_ctb; j++)
1295
1.05M
            {
1296
1.05M
                SET_BL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[(num_4x4_blks_ctb_y - 1) * 8 + j]);
1297
1.05M
            }
1298
170k
        }
1299
1300
        /* set left flags unavailable for 1st ctb col */
1301
1.41M
        for(j = 0; j < num_4x4_blks_ctb_y; j++)
1302
1.21M
        {
1303
1.21M
            SET_L_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
1304
1.21M
            SET_BL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
1305
1.21M
            SET_TL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
1306
1.21M
        }
1307
1308
567k
        for(col_block_no = 0; col_block_no < num_col_blks; col_block_no++)
1309
368k
        {
1310
368k
            if(col_block_no == 1)
1311
24.3k
            {
1312
                /* For the rest of the ctbs, set left flags available */
1313
168k
                for(j = 0; j < num_4x4_blks_ctb_y; j++)
1314
143k
                {
1315
143k
                    SET_L_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
1316
143k
                }
1317
143k
                for(j = 0; j < num_4x4_blks_ctb_y - 1; j++)
1318
119k
                {
1319
119k
                    SET_BL_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
1320
119k
                    SET_TL_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[(j + 1) * 8]);
1321
119k
                }
1322
24.3k
                if(ht_offset != 0)
1323
11.1k
                {
1324
11.1k
                    SET_TL_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[0]);
1325
11.1k
                }
1326
24.3k
            }
1327
1328
368k
            if(col_block_no == num_col_blks - 1)
1329
198k
            {
1330
                /* set top right flags unavailable for last ctb col */
1331
1.41M
                for(i = 0; i < num_4x4_blks_ctb_y; i++)
1332
1.21M
                {
1333
1.21M
                    SET_TR_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[i * 8 + num_4x4_blks_last_ctb_x - 1]);
1334
1.21M
                }
1335
198k
            }
1336
1337
            /* Call intra analysis for the ctb */
1338
368k
            ihevce_ed_calc_ctb(
1339
368k
                ps_ed_ctxt,
1340
368k
                ps_ed_ctb,
1341
368k
                ps_ed_ctb_l1,
1342
368k
                pu1_src_pre_intra,
1343
368k
                src_stride,
1344
368k
                (col_block_no == num_col_blks - 1) ? num_4x4_blks_last_ctb_x : num_4x4_blks_in_ctb,
1345
368k
                num_4x4_blks_ctb_y,
1346
368k
                nbr_flags_ptr,
1347
368k
                layer_no,
1348
368k
                ps_ipe_optimised_function_list,
1349
368k
                ps_cmn_utils_optimised_function_list);
1350
368k
            pu1_src_pre_intra += src_inc_pre_intra;
1351
368k
            ps_ed_ctb += inc_ctb;
1352
368k
            ps_ed_ctb_l1 += 1;
1353
368k
        }
1354
198k
    }
1355
532k
}
1356
1357
/*!
1358
******************************************************************************
1359
* \if Function name : ihevce_decomp_pre_intra_process \endif
1360
*
1361
* \brief
1362
*  Frame level function to decompose given layer L0 into coarser layers and
1363
*  perform intra analysis on layers below L0
1364
*
1365
*****************************************************************************
1366
*/
1367
void ihevce_decomp_pre_intra_process(
1368
    void *pv_ctxt,
1369
    ihevce_lap_output_params_t *ps_lap_out_prms,
1370
    frm_ctb_ctxt_t *ps_frm_ctb_prms,
1371
    void *pv_multi_thrd_ctxt,
1372
    WORD32 thrd_id,
1373
    WORD32 i4_ping_pong)
1374
95.3k
{
1375
95.3k
    ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt = pv_ctxt;
1376
95.3k
    ihevce_decomp_pre_intra_ctxt_t *ps_ctxt = ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thrd_id];
1377
95.3k
    multi_thrd_ctxt_t *ps_multi_thrd = (multi_thrd_ctxt_t *)pv_multi_thrd_ctxt;
1378
95.3k
    WORD32 i4_num_layers = ps_ctxt->i4_num_layers;
1379
95.3k
    UWORD8 *pu1_wkg_mem = ps_ctxt->au1_wkg_mem;
1380
95.3k
    ihevce_ed_ctxt_t *ps_ed_ctxt = ps_ctxt->ps_ed_ctxt;
1381
95.3k
    ihevce_ed_ctb_l1_t *ps_ed_ctb_l1 = ps_ed_ctxt->ps_ed_ctb_l1;
1382
95.3k
    ihevce_ed_blk_t *ps_ed;
1383
95.3k
    WORD32 i4_layer_no;
1384
95.3k
    WORD32 end_of_layer;
1385
95.3k
    UWORD8 *pu1_src, *pu1_dst;
1386
95.3k
    WORD32 src_stride, dst_stride;
1387
95.3k
    WORD32 i4_layer_wd, i4_layer_ht;
1388
95.3k
    WORD32 ht_offset, block_ht, row_block_no, num_row_blocks;
1389
95.3k
    WORD32 block_wd, num_col_blks;
1390
95.3k
    WORD32 skip_decomp, skip_pre_intra;
1391
95.3k
    WORD32 inc_ctb;
1392
1393
95.3k
    ASSERT(i4_num_layers >= 3);
1394
95.3k
    ps_ctxt->as_layers[0].pu1_inp = (UWORD8 *)ps_lap_out_prms->s_input_buf.pv_y_buf;
1395
95.3k
    ps_ctxt->as_layers[0].i4_inp_stride = ps_lap_out_prms->s_input_buf.i4_y_strd;
1396
95.3k
    ps_ctxt->as_layers[0].i4_actual_wd = ps_lap_out_prms->s_input_buf.i4_y_wd;
1397
95.3k
    ps_ctxt->as_layers[0].i4_actual_ht = ps_lap_out_prms->s_input_buf.i4_y_ht;
1398
1399
    /* This loop does decomp & intra by picking jobs from job queue */
1400
382k
    for(i4_layer_no = 0; i4_layer_no < i4_num_layers; i4_layer_no++)
1401
287k
    {
1402
287k
        WORD32 idx = 0;
1403
1404
287k
        src_stride = ps_ctxt->as_layers[i4_layer_no].i4_inp_stride;
1405
287k
        pu1_src = ps_ctxt->as_layers[i4_layer_no].pu1_inp;
1406
287k
        i4_layer_wd = ps_ctxt->as_layers[i4_layer_no].i4_actual_wd;
1407
287k
        i4_layer_ht = ps_ctxt->as_layers[i4_layer_no].i4_actual_ht;
1408
287k
        pu1_dst = ps_ctxt->as_layers[i4_layer_no + 1].pu1_inp;
1409
287k
        dst_stride = ps_ctxt->as_layers[i4_layer_no + 1].i4_inp_stride;
1410
287k
        block_wd = ps_ctxt->as_layers[i4_layer_no].i4_decomp_blk_wd;
1411
287k
        block_ht = ps_ctxt->as_layers[i4_layer_no].i4_decomp_blk_ht;
1412
287k
        num_col_blks = ps_ctxt->as_layers[i4_layer_no].i4_num_col_blks;
1413
287k
        num_row_blocks = ps_ctxt->as_layers[i4_layer_no].i4_num_row_blks;
1414
287k
        inc_ctb = (block_wd >> 2) * (block_wd >> 2);
1415
287k
        end_of_layer = 0;
1416
287k
        skip_pre_intra = 1;
1417
287k
        skip_decomp = 0;
1418
287k
        if(i4_layer_no >= (ps_ctxt->i4_num_layers - 1))
1419
95.3k
        {
1420
95.3k
            skip_decomp = 1;
1421
95.3k
        }
1422
1423
        /* ------------ Loop over all the CTB rows & perform Decomp --------------- */
1424
1.46M
        while(0 == end_of_layer)
1425
1.17M
        {
1426
1.17M
            job_queue_t *ps_pre_enc_job;
1427
1.17M
            WORD32 num_4x4_blks_ctb_y = 0, num_4x4_blks_last_ctb_x = 0;
1428
1429
            /* Get the current row from the job queue */
1430
1.17M
            ps_pre_enc_job = (job_queue_t *)ihevce_pre_enc_grp_get_next_job(
1431
1.17M
                pv_multi_thrd_ctxt, (DECOMP_JOB_LYR0 + i4_layer_no), 1, i4_ping_pong);
1432
1433
            /* If all rows are done, set the end of layer flag to 1, */
1434
1.17M
            if(NULL == ps_pre_enc_job)
1435
287k
            {
1436
287k
                end_of_layer = 1;
1437
287k
            }
1438
890k
            else
1439
890k
            {
1440
                /* Obtain the current row's details from the job */
1441
890k
                row_block_no = ps_pre_enc_job->s_job_info.s_decomp_job_info.i4_vert_unit_row_no;
1442
890k
                ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = row_block_no;
1443
890k
                ht_offset = row_block_no * block_ht;
1444
1445
890k
                if(row_block_no < (num_row_blocks))
1446
331k
                {
1447
331k
                    pu1_dst = ps_ctxt->as_layers[i4_layer_no + 1].pu1_inp +
1448
331k
                              ((block_ht >> 1) * dst_stride * row_block_no);
1449
1450
                    /* call the row level processing function */
1451
331k
                    ihevce_decomp_pre_intra_process_row(
1452
331k
                        pu1_src,
1453
331k
                        src_stride,
1454
331k
                        pu1_dst,
1455
331k
                        dst_stride,
1456
331k
                        i4_layer_wd,
1457
331k
                        i4_layer_ht,
1458
331k
                        pu1_wkg_mem,
1459
331k
                        ht_offset,
1460
331k
                        block_ht,
1461
331k
                        block_wd,
1462
331k
                        num_col_blks,
1463
331k
                        i4_layer_no,
1464
331k
                        ps_ed_ctxt,
1465
331k
                        ps_ed,
1466
331k
                        ps_ed_ctb_l1,
1467
331k
                        num_4x4_blks_ctb_y,
1468
331k
                        num_4x4_blks_last_ctb_x,
1469
331k
                        skip_decomp,
1470
331k
                        skip_pre_intra,
1471
331k
                        row_block_no,
1472
331k
                        ps_ctxt->ps_ctb_analyse,
1473
331k
                        &ps_ctxt->s_ipe_optimised_function_list,
1474
331k
                        &ps_ctxt->s_cmn_opt_func);
1475
331k
                }
1476
890k
                idx++;
1477
                /* set the output dependency */
1478
890k
                ihevce_pre_enc_grp_job_set_out_dep(
1479
890k
                    pv_multi_thrd_ctxt, ps_pre_enc_job, i4_ping_pong);
1480
890k
            }
1481
1.17M
        }
1482
287k
        ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = idx;
1483
1484
        /* ------------ For the same rows perform preintra if required --------------- */
1485
287k
        ihevce_ed_frame_init(ps_ed_ctxt, i4_layer_no);
1486
1487
287k
        if((1 == i4_layer_no) && (IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset))
1488
31.4k
        {
1489
31.4k
            WORD32 vert_ctr, ctb_ctr, i;
1490
31.4k
            WORD32 ctb_ctr_blks = ps_ctxt->as_layers[1].i4_num_col_blks;
1491
31.4k
            WORD32 vert_ctr_blks = ps_ctxt->as_layers[1].i4_num_row_blks;
1492
1493
31.4k
            if((ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P6) &&
1494
31.4k
               (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE))
1495
2.29k
            {
1496
4.61k
                for(vert_ctr = 0; vert_ctr < vert_ctr_blks; vert_ctr++)
1497
2.32k
                {
1498
2.32k
                    ihevce_ed_ctb_l1_t *ps_ed_ctb_row_l1 =
1499
2.32k
                        ps_ctxt->ps_ed_ctb_l1 + vert_ctr * ps_frm_ctb_prms->i4_num_ctbs_horz;
1500
1501
4.64k
                    for(ctb_ctr = 0; ctb_ctr < ctb_ctr_blks; ctb_ctr++)
1502
2.32k
                    {
1503
2.32k
                        ihevce_ed_ctb_l1_t *ps_ed_ctb_curr_l1 = ps_ed_ctb_row_l1 + ctb_ctr;
1504
1505
39.4k
                        for(i = 0; i < 16; i++)
1506
37.1k
                        {
1507
37.1k
                            ps_ed_ctb_curr_l1->i4_best_sad_cost_8x8_l1_ipe[i] = 0x7fffffff;
1508
37.1k
                            ps_ed_ctb_curr_l1->i4_best_sad_8x8_l1_ipe[i] = 0x7fffffff;
1509
37.1k
                        }
1510
2.32k
                    }
1511
2.32k
                }
1512
2.29k
            }
1513
31.4k
        }
1514
1515
287k
#if DISABLE_L2_IPE_IN_PB_L1_IN_B
1516
287k
        if(((2 == i4_layer_no) && (ps_lap_out_prms->i4_pic_type == IV_I_FRAME ||
1517
82.0k
                                   ps_lap_out_prms->i4_pic_type == IV_IDR_FRAME)) ||
1518
255k
           ((1 == i4_layer_no) &&
1519
95.3k
            (ps_lap_out_prms->i4_temporal_lyr_id <= TEMPORAL_LAYER_DISABLE)) ||
1520
170k
           ((IHEVCE_QUALITY_P6 != ps_ctxt->i4_quality_preset) && (0 != i4_layer_no)))
1521
#else
1522
        if((0 != i4_layer_no) &&
1523
           (1 != ((IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset) &&
1524
                  (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE))))
1525
#endif
1526
171k
        {
1527
171k
            WORD32 i4_num_rows = ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed;
1528
1529
171k
            ps_ed_ctxt->lambda = ps_ctxt->ai4_lambda[i4_layer_no];
1530
171k
            if(0 == i4_layer_no)
1531
0
            {
1532
0
                ps_ed_ctxt->ps_ed_pic = NULL;
1533
0
                ps_ed_ctxt->ps_ed = NULL;
1534
0
                ps_ed_ctxt->ps_ed_ctb_l1_pic = NULL;
1535
0
                ps_ed_ctxt->ps_ed_ctb_l1 = NULL;
1536
0
            }
1537
171k
            else if(1 == i4_layer_no)
1538
93.0k
            {
1539
93.0k
                ps_ed_ctxt->ps_ed_pic = ps_ctxt->ps_layer1_buf;
1540
93.0k
                ps_ed_ctxt->ps_ed = ps_ctxt->ps_layer1_buf;
1541
93.0k
                ps_ed_ctxt->ps_ed_ctb_l1_pic = ps_ctxt->ps_ed_ctb_l1;
1542
93.0k
                ps_ed_ctxt->ps_ed_ctb_l1 = ps_ctxt->ps_ed_ctb_l1;
1543
93.0k
            }
1544
78.5k
            else if(2 == i4_layer_no)
1545
77.3k
            {
1546
77.3k
                ps_ed_ctxt->ps_ed_pic = ps_ctxt->ps_layer2_buf;
1547
77.3k
                ps_ed_ctxt->ps_ed = ps_ctxt->ps_layer2_buf;
1548
77.3k
                ps_ed_ctxt->ps_ed_ctb_l1_pic = NULL;
1549
77.3k
                ps_ed_ctxt->ps_ed_ctb_l1 = NULL;
1550
77.3k
            }
1551
1552
171k
            skip_decomp = 1;
1553
171k
            skip_pre_intra = 0;
1554
1555
704k
            for(idx = 0; idx < i4_num_rows; idx++)
1556
532k
            {
1557
532k
                WORD32 num_4x4_blks_ctb_y = 0, num_4x4_blks_last_ctb_x = 0;
1558
1559
                /* Obtain the current row's details from the job */
1560
532k
                row_block_no = ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx];
1561
532k
                ht_offset = row_block_no * block_ht;
1562
1563
532k
                if(row_block_no < (num_row_blocks))
1564
200k
                {
1565
200k
                    pu1_dst = ps_ctxt->as_layers[i4_layer_no + 1].pu1_inp +
1566
200k
                              ((block_ht >> 1) * dst_stride * row_block_no);
1567
1568
200k
                    if(i4_layer_no == 1 || i4_layer_no == 2)
1569
198k
                    {
1570
198k
                        ps_ed = ps_ed_ctxt->ps_ed + (row_block_no * inc_ctb * (num_col_blks));
1571
198k
                        ps_ed_ctb_l1 = ps_ed_ctxt->ps_ed_ctb_l1 + (row_block_no * num_col_blks);
1572
198k
                        ps_ed_ctxt->i4_quality_preset = ps_ctxt->i4_quality_preset;
1573
198k
                        num_4x4_blks_last_ctb_x = block_wd >> 2;
1574
198k
                        num_4x4_blks_ctb_y = block_ht >> 2;
1575
198k
                        if(row_block_no == num_row_blocks - 1)
1576
170k
                        {
1577
170k
                            if(i4_layer_ht % block_ht)
1578
2.11k
                            {
1579
2.11k
                                num_4x4_blks_ctb_y = ((i4_layer_ht % block_ht) + 3) >> 2;
1580
2.11k
                            }
1581
170k
                        }
1582
198k
                        if(i4_layer_wd % block_wd)
1583
8.32k
                        {
1584
8.32k
                            num_4x4_blks_last_ctb_x = ((i4_layer_wd % block_wd) + 3) >> 2;
1585
8.32k
                        }
1586
198k
                    }
1587
1588
                    /* call the row level processing function */
1589
200k
                    ihevce_decomp_pre_intra_process_row(
1590
200k
                        pu1_src,
1591
200k
                        src_stride,
1592
200k
                        pu1_dst,
1593
200k
                        dst_stride,
1594
200k
                        i4_layer_wd,
1595
200k
                        i4_layer_ht,
1596
200k
                        pu1_wkg_mem,
1597
200k
                        ht_offset,
1598
200k
                        block_ht,
1599
200k
                        block_wd,
1600
200k
                        num_col_blks,
1601
200k
                        i4_layer_no,
1602
200k
                        ps_ed_ctxt,
1603
200k
                        ps_ed,
1604
200k
                        ps_ed_ctb_l1,
1605
200k
                        num_4x4_blks_ctb_y,
1606
200k
                        num_4x4_blks_last_ctb_x,
1607
200k
                        skip_decomp,
1608
200k
                        skip_pre_intra,
1609
200k
                        row_block_no,
1610
200k
                        NULL,
1611
200k
                        &ps_ctxt->s_ipe_optimised_function_list,
1612
200k
                        &ps_ctxt->s_cmn_opt_func);
1613
200k
                }
1614
1615
532k
                if(1 == i4_layer_no)
1616
289k
                {
1617
289k
                    ps_multi_thrd->aai4_l1_pre_intra_done[i4_ping_pong][row_block_no] = 1;
1618
289k
                }
1619
532k
            }
1620
11.8M
            for(idx = 0; idx < MAX_NUM_CTB_ROWS_FRM; idx++)
1621
11.6M
            {
1622
11.6M
                ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = -1;
1623
11.6M
            }
1624
171k
            ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = 0;
1625
171k
        }
1626
1627
287k
#if DISABLE_L2_IPE_IN_PB_L1_IN_B
1628
287k
        if((IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset) &&
1629
94.5k
           (((i4_layer_no == 2) && (ps_lap_out_prms->i4_pic_type == ISLICE)) ||
1630
92.2k
            ((i4_layer_no == 1) && (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE))))
1631
4.59k
        {
1632
4.59k
            WORD32 i4_num_rows = ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed;
1633
4.59k
            if(1 == i4_layer_no)
1634
2.29k
            {
1635
9.18k
                for(idx = 0; idx < i4_num_rows; idx++)
1636
6.88k
                {
1637
6.88k
                    row_block_no = ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx];
1638
1639
6.88k
                    {
1640
6.88k
                        ps_multi_thrd->aai4_l1_pre_intra_done[i4_ping_pong][row_block_no] = 1;
1641
6.88k
                    }
1642
6.88k
                }
1643
2.29k
            }
1644
316k
            for(idx = 0; idx < MAX_NUM_CTB_ROWS_FRM; idx++)
1645
312k
            {
1646
312k
                ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = -1;
1647
312k
            }
1648
4.59k
            ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = 0;
1649
4.59k
        }
1650
#else
1651
        if((i4_layer_no != 0) && ((IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset) &&
1652
                                  (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE)))
1653
        {
1654
            WORD32 i4_num_rows = ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed;
1655
            for(idx = 0; idx < i4_num_rows; idx++)
1656
            {
1657
                row_block_no = ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx];
1658
                if(1 == i4_layer_no)
1659
                {
1660
                    ps_multi_thrd->aai4_l1_pre_intra_done[i4_ping_pong][row_block_no] = 1;
1661
                }
1662
            }
1663
            for(idx = 0; idx < MAX_NUM_CTB_ROWS_FRM; idx++)
1664
            {
1665
                ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = -1;
1666
            }
1667
            ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = 0;
1668
        }
1669
#endif
1670
287k
    }
1671
95.3k
}
1672
1673
/*!
1674
************************************************************************
1675
* \brief
1676
*    return number of records used by decomp pre intra
1677
*
1678
************************************************************************
1679
*/
1680
WORD32 ihevce_decomp_pre_intra_get_num_mem_recs(void)
1681
13.5k
{
1682
13.5k
    return (NUM_DECOMP_PRE_INTRA_MEM_RECS);
1683
13.5k
}
1684
1685
/*!
1686
************************************************************************
1687
* @brief
1688
*    return each record attributes of  decomp pre intra
1689
************************************************************************
1690
*/
1691
WORD32 ihevce_decomp_pre_intra_get_mem_recs(
1692
    iv_mem_rec_t *ps_mem_tab, WORD32 i4_num_proc_thrds, WORD32 i4_mem_space)
1693
6.76k
{
1694
    /* memories should be requested assuming worst case requirememnts */
1695
1696
    /* Module context structure */
1697
6.76k
    ps_mem_tab[DECOMP_PRE_INTRA_CTXT].i4_mem_size = sizeof(ihevce_decomp_pre_intra_master_ctxt_t);
1698
6.76k
    ps_mem_tab[DECOMP_PRE_INTRA_CTXT].e_mem_type = (IV_MEM_TYPE_T)i4_mem_space;
1699
6.76k
    ps_mem_tab[DECOMP_PRE_INTRA_CTXT].i4_mem_alignment = 8;
1700
1701
    /* Thread context structure */
1702
6.76k
    ps_mem_tab[DECOMP_PRE_INTRA_THRDS_CTXT].i4_mem_size =
1703
6.76k
        i4_num_proc_thrds * sizeof(ihevce_decomp_pre_intra_ctxt_t);
1704
6.76k
    ps_mem_tab[DECOMP_PRE_INTRA_THRDS_CTXT].e_mem_type = (IV_MEM_TYPE_T)i4_mem_space;
1705
6.76k
    ps_mem_tab[DECOMP_PRE_INTRA_THRDS_CTXT].i4_mem_alignment = 8;
1706
1707
    /* early decision context structure */
1708
6.76k
    ps_mem_tab[DECOMP_PRE_INTRA_ED_CTXT].i4_mem_size = i4_num_proc_thrds * sizeof(ihevce_ed_ctxt_t);
1709
6.76k
    ps_mem_tab[DECOMP_PRE_INTRA_ED_CTXT].e_mem_type = (IV_MEM_TYPE_T)i4_mem_space;
1710
6.76k
    ps_mem_tab[DECOMP_PRE_INTRA_ED_CTXT].i4_mem_alignment = 8;
1711
1712
6.76k
    return (NUM_DECOMP_PRE_INTRA_MEM_RECS);
1713
6.76k
}
1714
1715
/*!
1716
************************************************************************
1717
* @brief
1718
*    Init decomp pre intra context
1719
************************************************************************
1720
*/
1721
void *ihevce_decomp_pre_intra_init(
1722
    iv_mem_rec_t *ps_mem_tab,
1723
    ihevce_static_cfg_params_t *ps_init_prms,
1724
    WORD32 i4_num_proc_thrds,
1725
    func_selector_t *ps_func_selector,
1726
    WORD32 i4_resolution_id,
1727
    UWORD8 u1_is_popcnt_available)
1728
6.76k
{
1729
6.76k
    ihevce_decomp_pre_intra_master_ctxt_t *ps_mstr_ctxt = ps_mem_tab[DECOMP_PRE_INTRA_CTXT].pv_base;
1730
6.76k
    ihevce_decomp_pre_intra_ctxt_t *ps_ctxt = ps_mem_tab[DECOMP_PRE_INTRA_THRDS_CTXT].pv_base;
1731
6.76k
    ihevce_ed_ctxt_t *ps_ed_ctxt = ps_mem_tab[DECOMP_PRE_INTRA_ED_CTXT].pv_base;
1732
6.76k
    ihevce_tgt_params_t *ps_tgt_prms = &ps_init_prms->s_tgt_lyr_prms.as_tgt_params[i4_resolution_id];
1733
6.76k
    WORD32 min_cu_size = 1 << ps_init_prms->s_config_prms.i4_min_log2_cu_size;
1734
6.76k
    WORD32 a_wd[MAX_NUM_HME_LAYERS], a_ht[MAX_NUM_HME_LAYERS];
1735
6.76k
    WORD32 a_disp_wd[MAX_NUM_LAYERS], a_disp_ht[MAX_NUM_LAYERS];
1736
6.76k
    WORD32 n_tot_layers;
1737
6.76k
    WORD32 i, j, k;
1738
1739
    /* Get the height and width of each layer */
1740
6.76k
    *a_wd = ps_tgt_prms->i4_width + SET_CTB_ALIGN(ps_tgt_prms->i4_width, min_cu_size);
1741
6.76k
    *a_ht = ps_tgt_prms->i4_height + SET_CTB_ALIGN(ps_tgt_prms->i4_height, min_cu_size);
1742
6.76k
    n_tot_layers = hme_derive_num_layers(1, a_wd, a_ht, a_disp_wd, a_disp_ht);
1743
6.76k
    ps_mstr_ctxt->i4_num_proc_thrds = i4_num_proc_thrds;
1744
13.5k
    for(i = 0; i < ps_mstr_ctxt->i4_num_proc_thrds; i++)
1745
6.76k
    {
1746
6.76k
        ps_mstr_ctxt->aps_decomp_pre_intra_thrd_ctxt[i] = ps_ctxt;
1747
6.76k
        ps_ctxt->i4_num_layers = n_tot_layers;
1748
6.76k
        ps_ctxt->ps_ed_ctxt = ps_ed_ctxt;
1749
27.9k
        for(j = 0; j < n_tot_layers; j++)
1750
21.1k
        {
1751
            /** If CTB size= 64, decomp_blk_wd = 64 for L0, 32 for L1 , 16 for L2, 8 for L3 */
1752
21.1k
            WORD32 max_ctb_size = 1 << ps_init_prms->s_config_prms.i4_max_log2_cu_size;
1753
21.1k
            WORD32 decomp_blk_wd = max_ctb_size >> j;
1754
21.1k
            WORD32 decomp_blk_ht = max_ctb_size >> j;
1755
1756
21.1k
            ps_ctxt->as_layers[j].i4_actual_wd = a_wd[j];
1757
21.1k
            ps_ctxt->as_layers[j].i4_actual_ht = a_ht[j];
1758
21.1k
            if(0 == j)
1759
6.76k
            {
1760
6.76k
                ps_ctxt->as_layers[j].i4_padded_ht = a_ht[j];
1761
6.76k
                ps_ctxt->as_layers[j].i4_padded_wd = a_wd[j];
1762
6.76k
            }
1763
14.3k
            else
1764
14.3k
            {
1765
14.3k
                ps_ctxt->as_layers[j].i4_padded_ht = a_ht[j] + 32 + 4;
1766
14.3k
                ps_ctxt->as_layers[j].i4_padded_wd = a_wd[j] + 32 + 4;
1767
14.3k
            }
1768
21.1k
            ps_ctxt->as_layers[j].pu1_inp = NULL;
1769
21.1k
            ps_ctxt->as_layers[j].i4_inp_stride = 0;
1770
21.1k
            ps_ctxt->as_layers[j].i4_decomp_blk_ht = decomp_blk_ht;
1771
21.1k
            ps_ctxt->as_layers[j].i4_decomp_blk_wd = decomp_blk_wd;
1772
21.1k
            ps_ctxt->as_layers[j].i4_num_row_blks = ((a_ht[j] + (decomp_blk_ht - 1)) / decomp_blk_ht);
1773
21.1k
            ps_ctxt->as_layers[j].i4_num_col_blks = ((a_wd[j] + (decomp_blk_wd - 1)) / decomp_blk_wd);
1774
1.45M
            for(k = 0; k < MAX_NUM_CTB_ROWS_FRM; k++)
1775
1.43M
            {
1776
1.43M
                ps_ctxt->as_layers[j].ai4_curr_row_no[k] = -1;
1777
1.43M
            }
1778
21.1k
            ps_ctxt->as_layers[j].i4_num_rows_processed = 0;
1779
21.1k
        }
1780
6.76k
        ps_ctxt->i4_quality_preset = ps_tgt_prms->i4_quality_preset;
1781
6.76k
        if(ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P7)
1782
397
        {
1783
397
            ps_ctxt->i4_quality_preset = IHEVCE_QUALITY_P6;
1784
397
        }
1785
6.76k
        if(ps_init_prms->s_coding_tools_prms.i4_vqet &
1786
6.76k
           (1 << BITPOS_IN_VQ_TOGGLE_FOR_CONTROL_TOGGLER))
1787
0
        {
1788
0
            if(ps_init_prms->s_coding_tools_prms.i4_vqet &
1789
0
               (1 << BITPOS_IN_VQ_TOGGLE_FOR_ENABLING_NOISE_PRESERVATION))
1790
0
            {
1791
0
                ps_ctxt->i4_enable_noise_detection = 1;
1792
0
            }
1793
0
            else
1794
0
            {
1795
0
                ps_ctxt->i4_enable_noise_detection = 0;
1796
0
            }
1797
0
        }
1798
6.76k
        else
1799
6.76k
        {
1800
6.76k
            ps_ctxt->i4_enable_noise_detection = 0;
1801
6.76k
        }
1802
6.76k
        ihevce_cmn_utils_instr_set_router(
1803
6.76k
            &ps_ctxt->s_cmn_opt_func, u1_is_popcnt_available, ps_init_prms->e_arch_type);
1804
6.76k
        ihevce_ipe_instr_set_router(
1805
6.76k
            &ps_ctxt->s_ipe_optimised_function_list, ps_init_prms->e_arch_type);
1806
1807
6.76k
        ps_ed_ctxt->ps_func_selector = ps_func_selector;
1808
1809
6.76k
        ps_ctxt++;
1810
6.76k
        ps_ed_ctxt++;
1811
6.76k
    }
1812
    /* return the handle to caller */
1813
6.76k
    return ((void *)ps_mstr_ctxt);
1814
6.76k
}
1815
1816
/*!
1817
************************************************************************
1818
* @brief
1819
*    Init decomp pre intra layer buffers
1820
************************************************************************
1821
*/
1822
void ihevce_decomp_pre_intra_frame_init(
1823
    void *pv_ctxt,
1824
    UWORD8 **ppu1_decomp_lyr_bufs,
1825
    WORD32 *pi4_lyr_buf_stride,
1826
    ihevce_ed_blk_t *ps_layer1_buf,
1827
    ihevce_ed_blk_t *ps_layer2_buf,
1828
    ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
1829
    WORD32 i4_ol_sad_lambda_qf,
1830
    ctb_analyse_t *ps_ctb_analyse)
1831
95.3k
{
1832
95.3k
    ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt = pv_ctxt;
1833
95.3k
    ihevce_decomp_pre_intra_ctxt_t *ps_ctxt;
1834
95.3k
    WORD32 i, j;
1835
1836
190k
    for(i = 0; i < ps_master_ctxt->i4_num_proc_thrds; i++)
1837
95.3k
    {
1838
95.3k
        ps_ctxt = ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[i];
1839
1840
        /* L0 layer (actual input) is registered in process call */
1841
287k
        for(j = 1; j < ps_ctxt->i4_num_layers; j++)
1842
192k
        {
1843
192k
            ps_ctxt->as_layers[j].i4_inp_stride = pi4_lyr_buf_stride[j - 1];
1844
192k
            ps_ctxt->as_layers[j].pu1_inp = ppu1_decomp_lyr_bufs[j - 1];
1845
1846
            /* Populating the buffer pointers for layer1 and layer2 buffers to store the
1847
            structure for each 4x4 block after pre intra analysis on their respective layers */
1848
192k
            if(j == 1)
1849
95.3k
            {
1850
95.3k
                WORD32 sad_lambda_l1 = (3 * i4_ol_sad_lambda_qf >> 2);
1851
95.3k
                WORD32 temp = 1 << LAMBDA_Q_SHIFT;
1852
95.3k
                WORD32 lambda = ((temp) > sad_lambda_l1) ? temp : sad_lambda_l1;
1853
1854
95.3k
                ps_ctxt->ps_layer1_buf = ps_layer1_buf;
1855
95.3k
                ps_ctxt->ps_ed_ctb_l1 = ps_ed_ctb_l1;
1856
95.3k
                ps_ctxt->ai4_lambda[j] = lambda;
1857
95.3k
            }
1858
96.8k
            else if(j == 2)
1859
95.3k
            {
1860
95.3k
                WORD32 sad_lambda_l2 = i4_ol_sad_lambda_qf >> 1;
1861
95.3k
                WORD32 temp = 1 << LAMBDA_Q_SHIFT;
1862
95.3k
                WORD32 lambda = ((temp) > sad_lambda_l2) ? temp : sad_lambda_l2;
1863
1864
95.3k
                ps_ctxt->ps_layer2_buf = ps_layer2_buf;
1865
95.3k
                ps_ctxt->ai4_lambda[j] = lambda;
1866
95.3k
            }
1867
1.41k
            else
1868
1.41k
            {
1869
1.41k
                ps_ctxt->ai4_lambda[j] = -1;
1870
1.41k
            }
1871
192k
        }
1872
1873
        /* make the ps_ctb_analyse refernce as a part of the private context */
1874
95.3k
        ps_ctxt->ps_ctb_analyse = ps_ctb_analyse;
1875
95.3k
    }
1876
95.3k
}
1877
1878
/**
1879
*******************************************************************************
1880
*
1881
* @brief Merge Sort function.
1882
*
1883
* @par Description:
1884
*     This function sorts the data in the input array in ascending
1885
*     order using merge sort algorithm. Intermediate data obtained in
1886
*     merge sort are stored in output 2-D array.
1887
*
1888
* @param[in]
1889
*   pi4_input_val  :   Input 1-D array
1890
*   aai4_output_val:   Output 2-D array containing elements sorted in sets of
1891
*                      4,16,64 etc.
1892
*   i4_length      : length of the array
1893
*   i4_ip_sort_level: Input sort level. Specifies the level upto which array is sorted.
1894
*                     It should be 1 if the array is unsorted. Should be 4 if array is sorted
1895
*                     in sets of 4.
1896
*   i4_op_sort_level: Output sort level. Specify the level upto which sorting is required.
1897
*                     If it is given as length of array it sorts for whole array.
1898
*
1899
*******************************************************************************
1900
*/
1901
void ihevce_merge_sort(
1902
    WORD32 *pi4_input_val,
1903
    WORD32 aai4_output_val[][64],
1904
    WORD32 i4_length,
1905
    WORD32 i4_ip_sort_level,
1906
    WORD32 i4_op_sort_level)
1907
566k
{
1908
566k
    WORD32 i, j, k;
1909
566k
    WORD32 count, level;
1910
566k
    WORD32 temp[64];
1911
566k
    WORD32 *pi4_temp_buf_cpy;
1912
566k
    WORD32 *pi4_temp = &temp[0];
1913
566k
    WORD32 calc_level;
1914
1915
566k
    pi4_temp_buf_cpy = pi4_temp;
1916
1917
566k
    GETRANGE(calc_level, i4_op_sort_level / i4_ip_sort_level);
1918
1919
566k
    calc_level = calc_level - 1;
1920
1921
    /*** This function is written under the assumption that we need only intermediate values of
1922
    sort in the range of 4,16,64 etc. ***/
1923
566k
    ASSERT((calc_level % 2) == 0);
1924
1925
    /** One iteration of this for loop does 1 sets of sort and produces one intermediate value in 2 iterations **/
1926
2.86M
    for(level = 0; level < calc_level; level++)
1927
2.29M
    {
1928
        /** Merges adjacent sets of elements based on current sort level **/
1929
17.9M
        for(count = 0; count < i4_length; (count = count + (i4_ip_sort_level * 2)))
1930
15.6M
        {
1931
15.6M
            i = 0;
1932
15.6M
            j = 0;
1933
15.6M
            if(pi4_input_val[i4_ip_sort_level - 1] < pi4_input_val[i4_ip_sort_level])
1934
185k
            {
1935
                /*** Condition for early exit ***/
1936
185k
                memcpy(&pi4_temp[0], pi4_input_val, sizeof(WORD32) * i4_ip_sort_level * 2);
1937
185k
            }
1938
15.4M
            else
1939
15.4M
            {
1940
103M
                for(k = 0; k < (i4_ip_sort_level * 2); k++)
1941
87.7M
                {
1942
87.7M
                    if((i < i4_ip_sort_level) && (j < i4_ip_sort_level))
1943
51.8M
                    {
1944
51.8M
                        if(pi4_input_val[i] > pi4_input_val[j + i4_ip_sort_level])
1945
11.0M
                        {
1946
                            /** copy to output array **/
1947
11.0M
                            pi4_temp[k] = pi4_input_val[j + i4_ip_sort_level];
1948
11.0M
                            j++;
1949
11.0M
                        }
1950
40.8M
                        else
1951
40.8M
                        {
1952
                            /** copy to output array **/
1953
40.8M
                            pi4_temp[k] = pi4_input_val[i];
1954
40.8M
                            i++;
1955
40.8M
                        }
1956
51.8M
                    }
1957
35.8M
                    else if(i == i4_ip_sort_level)
1958
32.8M
                    {
1959
                        /** copy the remaining data to output array **/
1960
32.8M
                        pi4_temp[k] = pi4_input_val[j + i4_ip_sort_level];
1961
32.8M
                        j++;
1962
32.8M
                    }
1963
3.06M
                    else
1964
3.06M
                    {
1965
                        /** copy the remaining data to output array **/
1966
3.06M
                        pi4_temp[k] = pi4_input_val[i];
1967
3.06M
                        i++;
1968
3.06M
                    }
1969
87.7M
                }
1970
15.4M
            }
1971
15.6M
            pi4_input_val += (i4_ip_sort_level * 2);
1972
15.6M
            pi4_temp += (i4_ip_sort_level * 2);
1973
15.6M
        }
1974
2.29M
        pi4_input_val = pi4_temp - i4_length;
1975
1976
2.29M
        if(level % 2)
1977
1.14M
        {
1978
            /** Assign a temp address for storing next sort level output as we will not need this data as output **/
1979
1.14M
            pi4_temp = pi4_temp_buf_cpy;
1980
1.14M
        }
1981
1.14M
        else
1982
1.14M
        {
1983
            /** Assign address for storing the intermediate data into output 2-D array **/
1984
1.14M
            pi4_temp = aai4_output_val[level / 2];
1985
1.14M
        }
1986
2.29M
        i4_ip_sort_level *= 2;
1987
2.29M
    }
1988
566k
}
1989
1990
/*!
1991
************************************************************************
1992
* @brief
1993
*   Calculate the average activities at 16*16 (8*8 in L1) and 32*32
1994
*   (8*8 in L2) block sizes. As this function accumulates activities
1995
*   across blocks of a frame, this needs to be called by only one thread
1996
*   and only after ensuring the processing of entire frame is done
1997
************************************************************************
1998
*/
1999
void ihevce_decomp_pre_intra_curr_frame_pre_intra_deinit(
2000
    void *pv_pre_intra_ctxt,
2001
    pre_enc_me_ctxt_t *ps_curr_out,
2002
    frm_ctb_ctxt_t *ps_frm_ctb_prms)
2003
93.0k
{
2004
93.0k
    ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt = pv_pre_intra_ctxt;
2005
93.0k
    ihevce_decomp_pre_intra_ctxt_t *ps_ctxt = ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[0];
2006
2007
93.0k
    ULWORD64 u8_frame_8x8_sum_act_sqr = 0;
2008
93.0k
    LWORD64 ai8_frame_8x8_sum_act_sqr[2] = { 0, 0 };
2009
93.0k
    WORD32 ai4_frame_8x8_sum_act[2] = { 0, 0 };
2010
93.0k
    WORD32 ai4_frame_8x8_sum_blks[2] = { 0, 0 };
2011
2012
93.0k
    LWORD64 ai8_frame_16x16_sum_act_sqr[3] = { 0, 0, 0 };
2013
93.0k
    WORD32 ai4_frame_16x16_sum_act[3] = { 0, 0, 0 };
2014
93.0k
    WORD32 ai4_frame_16x16_sum_blks[3] = { 0, 0, 0 };
2015
2016
93.0k
    LWORD64 ai8_frame_32x32_sum_act_sqr[3] = { 0, 0, 0 };
2017
93.0k
    WORD32 ai4_frame_32x32_sum_act[3] = { 0, 0, 0 };
2018
93.0k
    WORD32 ai4_frame_32x32_sum_blks[3] = { 0, 0, 0 };
2019
2020
93.0k
    ihevce_ed_ctb_l1_t *ps_ed_ctb_pic_l1 = ps_curr_out->ps_ed_ctb_l1;
2021
93.0k
    ihevce_ed_blk_t *ps_ed_blk_l1 = ps_curr_out->ps_layer1_buf;
2022
93.0k
    WORD32 ctb_wd = ps_ctxt->as_layers[1].i4_decomp_blk_wd;
2023
93.0k
    WORD32 h_ctb_cnt = ps_ctxt->as_layers[1].i4_num_col_blks;
2024
93.0k
    WORD32 v_ctb_cnt = ps_ctxt->as_layers[1].i4_num_row_blks;
2025
93.0k
    WORD32 sub_blk_cnt = ((ctb_wd >> 2) * (ctb_wd >> 2));
2026
93.0k
    WORD32 i4_avg_noise_satd;
2027
93.0k
    WORD32 ctb_ctr, vert_ctr;
2028
93.0k
    WORD32 i, j, k;
2029
2030
93.0k
    {
2031
        /* Calculate min noise threshold */
2032
        /* Min noise threshold is calculated by taking average of lowest 1% satd val in
2033
         * the complete 4x4 frame satds */
2034
12.3M
#define MAX_SATD 64
2035
93.0k
#define SATD_NOISE_FLOOR_THRESHOLD 16
2036
93.0k
#define MIN_BLKS 2
2037
93.0k
        WORD32 i4_layer_wd = ps_ctxt->as_layers[1].i4_actual_wd;
2038
93.0k
        WORD32 i4_layer_ht = ps_ctxt->as_layers[1].i4_actual_ht;
2039
93.0k
        WORD32 i4_min_blk = ((MIN_BLKS * (i4_layer_wd >> 1) * (i4_layer_ht >> 1)) / 100);
2040
93.0k
        WORD32 i4_total_blks = 0;
2041
93.0k
        WORD32 satd_hist[MAX_SATD];
2042
93.0k
        LWORD64 i8_acc_satd = 0;
2043
2044
93.0k
        memset(satd_hist, 0, sizeof(satd_hist));
2045
12.4M
        for(i = 0; i < sub_blk_cnt * h_ctb_cnt * v_ctb_cnt; i++)
2046
12.3M
        {
2047
12.3M
            if(ps_ed_blk_l1[i].i4_4x4_satd >= 0 && ps_ed_blk_l1[i].i4_4x4_satd < MAX_SATD)
2048
11.3M
            {
2049
11.3M
                satd_hist[ps_ed_blk_l1[i].i4_4x4_satd]++;
2050
11.3M
            }
2051
12.3M
        }
2052
202k
        for(i = 0; i < MAX_SATD && i4_total_blks <= i4_min_blk; i++)
2053
109k
        {
2054
109k
            i4_total_blks += satd_hist[i];
2055
109k
            i8_acc_satd += (i * satd_hist[i]);
2056
109k
        }
2057
93.0k
        if(i4_total_blks < i4_min_blk)
2058
247
        {
2059
247
            i4_avg_noise_satd = SATD_NOISE_FLOOR_THRESHOLD;
2060
247
        }
2061
92.8k
        else
2062
92.8k
        {
2063
92.8k
            i4_avg_noise_satd = (WORD32)(i8_acc_satd + (i4_total_blks >> 1)) / i4_total_blks;
2064
92.8k
        }
2065
93.0k
        ps_curr_out->i4_avg_noise_thrshld_4x4 = i4_avg_noise_satd;
2066
93.0k
    }
2067
2068
200k
    for(vert_ctr = 0; vert_ctr < v_ctb_cnt; vert_ctr++)
2069
107k
    {
2070
107k
        ihevce_ed_ctb_l1_t *ps_ed_ctb_row_l1 =
2071
107k
            ps_ed_ctb_pic_l1 + vert_ctr * ps_frm_ctb_prms->i4_num_ctbs_horz;
2072
107k
        ihevce_ed_blk_t *ps_ed = ps_ed_blk_l1 + (vert_ctr * sub_blk_cnt * h_ctb_cnt);
2073
2074
301k
        for(ctb_ctr = 0; ctb_ctr < h_ctb_cnt; ctb_ctr++, ps_ed += sub_blk_cnt)
2075
193k
        {
2076
193k
            ihevce_ed_ctb_l1_t *ps_ed_ctb_curr_l1 = ps_ed_ctb_row_l1 + ctb_ctr;
2077
193k
            WORD8 b8_satd_eval[4];
2078
193k
            WORD32 ai4_satd_4x4[64];
2079
193k
            WORD32 ai4_satd_8x8[16];  // derived from accumulating 4x4 satds
2080
193k
            WORD32 ai4_satd_16x16[4] = { 0 };  // derived from accumulating 8x8 satds
2081
193k
            WORD32 i4_satd_32x32 = 0;  // derived from accumulating 8x8 satds
2082
            /* This 2-D array will contain 4x4 satds sorted in ascending order in sets
2083
             * of 4, 16, 64  For example : '5 10 2 7 6 12 3 1' array input will return
2084
             * '2 5 7 10 1 3 6 12' if sorted in sets of 4 */
2085
193k
            WORD32 aai4_sort_4_16_64_satd[3][64];
2086
            /* This 2-D array will contain 8x8 satds sorted in ascending order in sets of
2087
             * 4, 16***/
2088
193k
            WORD32 aai4_sort_4_16_satd[2][64];
2089
2090
193k
            memset(b8_satd_eval, 1, sizeof(b8_satd_eval));
2091
968k
            for(i = 0; i < 4; i++)
2092
774k
            {
2093
774k
                ihevce_ed_blk_t *ps_ed_b32 = &ps_ed[i * 16];
2094
2095
3.87M
                for(j = 0; j < 4; j++)
2096
3.09M
                {
2097
3.09M
                    ihevce_ed_blk_t *ps_ed_b16 = &ps_ed_b32[j * 4];
2098
3.09M
                    WORD32 satd_sum = 0;
2099
3.09M
                    WORD32 blk_cnt = 0;
2100
2101
15.4M
                    for(k = 0; k < 4; k++)
2102
12.3M
                    {
2103
12.3M
                        ihevce_ed_blk_t *ps_ed_b4 = &ps_ed_b16[k];
2104
2105
12.3M
                        if(-1 != ps_ed_b4->i4_4x4_satd)
2106
11.9M
                        {
2107
11.9M
#define SUB_NOISE_THRSHLD 0
2108
#if SUB_NOISE_THRSHLD
2109
                            ps_ed_b4->i4_4x4_satd = ps_ed_b4->i4_4x4_satd - i4_avg_noise_satd;
2110
                            if(ps_ed_b4->i4_4x4_satd < 0)
2111
                            {
2112
                                ps_ed_b4->i4_4x4_satd = 0;
2113
                            }
2114
#else
2115
11.9M
                            if(ps_ed_b4->i4_4x4_satd < i4_avg_noise_satd)
2116
2.10k
                            {
2117
2.10k
                                ps_ed_b4->i4_4x4_satd = i4_avg_noise_satd;
2118
2.10k
                            }
2119
11.9M
#endif
2120
11.9M
                            blk_cnt++;
2121
11.9M
                            satd_sum += ps_ed_b4->i4_4x4_satd;
2122
11.9M
                        }
2123
12.3M
                        ai4_satd_4x4[i * 16 + j * 4 + k] = ps_ed_b4->i4_4x4_satd;
2124
12.3M
                    }
2125
3.09M
                    ASSERT(blk_cnt == 0 || blk_cnt == 4);
2126
3.09M
                    if(blk_cnt == 0)
2127
120k
                    {
2128
120k
                        satd_sum = -1;
2129
120k
                    }
2130
3.09M
                    ai4_satd_8x8[i * 4 + j] = satd_sum;
2131
3.09M
                    ai4_satd_16x16[i] += satd_sum;
2132
3.09M
                    i4_satd_32x32 += satd_sum;
2133
3.09M
                    ps_ed_ctb_curr_l1->i4_sum_4x4_satd[i * 4 + j] = satd_sum;
2134
3.09M
                }
2135
774k
            }
2136
2137
193k
            {
2138
                /* This function will sort 64 elements in array ai4_satd_4x4 in ascending order
2139
                 *  to 3 arrays in sets of 4, 16, 64 into the 2-D array aai4_sort_4_16_64_satd */
2140
193k
                WORD32 array_length = sizeof(ai4_satd_4x4) / sizeof(WORD32);
2141
193k
                ihevce_merge_sort(
2142
193k
                    &ai4_satd_4x4[0], aai4_sort_4_16_64_satd, array_length, 1, 64);
2143
2144
                /* This function will sort the elements of array ai4_satd_8x8 in ascending order
2145
                 *  to 2 arrays in sets of 4, 16 into the 2-D array aai4_sort_4_16_satd */
2146
193k
                array_length = sizeof(ai4_satd_8x8) / sizeof(WORD32);
2147
193k
                ihevce_merge_sort(
2148
193k
                    &ai4_satd_8x8[0], aai4_sort_4_16_satd, array_length, 1, 16);
2149
193k
            }
2150
2151
            /* Populate avg satd to calculate modulation index and activity factors */
2152
            /* 16x16 */
2153
968k
            for(i = 0; i < 4; i++)
2154
774k
            {
2155
3.87M
                for(j = 0; j < 4; j++)
2156
3.09M
                {
2157
3.09M
                    WORD32 satd_sum = ps_ed_ctb_curr_l1->i4_sum_4x4_satd[i * 4 + j];
2158
3.09M
                    WORD32 satd_min = aai4_sort_4_16_64_satd[0][i * 16 + j * 4 + MEDIAN_CU_TU];
2159
2160
3.09M
                    ASSERT(-2 != satd_sum);
2161
3.09M
                    ps_ed_ctb_curr_l1->i4_min_4x4_satd[i * 4 + j] = satd_min;
2162
2163
3.09M
                    if(-1 != satd_sum)
2164
2.97M
                    {
2165
2.97M
                        ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][0] = satd_sum;
2166
2.97M
                        ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][1] = satd_min;
2167
2168
2.97M
                        u8_frame_8x8_sum_act_sqr += (satd_sum * satd_sum);
2169
2.97M
                        ai4_frame_8x8_sum_act[0] += satd_sum;
2170
2.97M
                        ai8_frame_8x8_sum_act_sqr[0] += (satd_sum * satd_sum);
2171
2.97M
                        ai4_frame_8x8_sum_blks[0] += 1;
2172
2.97M
                        ai4_frame_8x8_sum_act[1] += satd_min;
2173
2.97M
                        ai8_frame_8x8_sum_act_sqr[1] += (satd_min * satd_min);
2174
2.97M
                        ai4_frame_8x8_sum_blks[1] += 1;
2175
2.97M
                    }
2176
120k
                    else
2177
120k
                    {
2178
120k
                        ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][0] = -1;
2179
120k
                        ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][1] = -1;
2180
120k
                        b8_satd_eval[i] = 0;
2181
120k
                    }
2182
3.09M
                }
2183
2184
774k
                if(b8_satd_eval[i])
2185
744k
                {
2186
744k
                    ps_ed_ctb_curr_l1->i4_16x16_satd[i][0] = ai4_satd_16x16[i];
2187
744k
                    ps_ed_ctb_curr_l1->i4_16x16_satd[i][1] = aai4_sort_4_16_satd[0][i * 4 + MEDIAN_CU_TU];
2188
744k
                    ps_ed_ctb_curr_l1->i4_16x16_satd[i][2] = aai4_sort_4_16_64_satd[1][i * 16 + MEDIAN_CU_TU_BY_2];
2189
2190
2.97M
                    for(k = 0; k < 3; k++)
2191
2.23M
                    {
2192
2.23M
                        WORD32 satd = ps_ed_ctb_curr_l1->i4_16x16_satd[i][k];
2193
2194
2.23M
                        ai4_frame_16x16_sum_act[k] += satd;
2195
2.23M
                        ai8_frame_16x16_sum_act_sqr[k] += (satd * satd);
2196
2.23M
                        ai4_frame_16x16_sum_blks[k] += 1;
2197
2.23M
                    }
2198
744k
                }
2199
30.1k
                else
2200
30.1k
                {
2201
30.1k
                    ps_ed_ctb_curr_l1->i4_16x16_satd[i][0] = -1;
2202
30.1k
                    ps_ed_ctb_curr_l1->i4_16x16_satd[i][1] = -1;
2203
30.1k
                    ps_ed_ctb_curr_l1->i4_16x16_satd[i][2] = -1;
2204
30.1k
                }
2205
774k
            }
2206
2207
            /*32x32*/
2208
193k
            if(b8_satd_eval[0] && b8_satd_eval[1] && b8_satd_eval[2] && b8_satd_eval[3])
2209
178k
            {
2210
178k
                WORD32 aai4_sort_4_satd[1][64];
2211
178k
                WORD32 array_length = sizeof(ai4_satd_16x16) / sizeof(WORD32);
2212
178k
                WORD32 satd;
2213
2214
                /* Sort 4 elements in ascending order */
2215
178k
                ihevce_merge_sort(ai4_satd_16x16, aai4_sort_4_satd, array_length, 1, 4);
2216
2217
178k
                ps_ed_ctb_curr_l1->i4_32x32_satd[0][0] = aai4_sort_4_satd[0][MEDIAN_CU_TU];
2218
178k
                ps_ed_ctb_curr_l1->i4_32x32_satd[0][1] = aai4_sort_4_16_satd[1][MEDIAN_CU_TU_BY_2];
2219
178k
                ps_ed_ctb_curr_l1->i4_32x32_satd[0][2] = aai4_sort_4_16_64_satd[2][MEDIAN_CU_TU_BY_4];
2220
178k
                ps_ed_ctb_curr_l1->i4_32x32_satd[0][3] = i4_satd_32x32;
2221
2222
715k
                for(k = 0; k < 3; k++)
2223
536k
                {
2224
536k
                    WORD32 satd = ps_ed_ctb_curr_l1->i4_32x32_satd[0][k];
2225
2226
536k
                    ai4_frame_32x32_sum_act[k] += satd;
2227
536k
                    ai8_frame_32x32_sum_act_sqr[k] += (satd * satd);
2228
536k
                    ai4_frame_32x32_sum_blks[k] += 1;
2229
536k
                }
2230
178k
            }
2231
14.7k
            else
2232
14.7k
            {
2233
14.7k
                ps_ed_ctb_curr_l1->i4_32x32_satd[0][0] = -1;
2234
14.7k
                ps_ed_ctb_curr_l1->i4_32x32_satd[0][1] = -1;
2235
14.7k
                ps_ed_ctb_curr_l1->i4_32x32_satd[0][2] = -1;
2236
14.7k
                ps_ed_ctb_curr_l1->i4_32x32_satd[0][3] = -1;
2237
14.7k
            }
2238
193k
        }
2239
107k
    }
2240
2241
279k
    for(i = 0; i < 2; i++)
2242
186k
    {
2243
        /*8x8*/
2244
186k
#if USE_SQRT_AVG_OF_SATD_SQR
2245
186k
        ps_curr_out->i8_curr_frame_8x8_sum_act[i] = ai8_frame_8x8_sum_act_sqr[i];
2246
#else
2247
        ps_curr_out->i8_curr_frame_8x8_sum_act[i] = ai4_frame_8x8_sum_act[i];
2248
#endif
2249
186k
        ps_curr_out->i4_curr_frame_8x8_sum_act_for_strength[i] = ai4_frame_8x8_sum_act[i];
2250
186k
        ps_curr_out->i4_curr_frame_8x8_num_blks[i] = ai4_frame_8x8_sum_blks[i];
2251
186k
        ps_curr_out->u8_curr_frame_8x8_sum_act_sqr = u8_frame_8x8_sum_act_sqr;
2252
2253
        /*16x16*/
2254
186k
#if USE_SQRT_AVG_OF_SATD_SQR
2255
186k
        ps_curr_out->i8_curr_frame_16x16_sum_act[i] = ai8_frame_16x16_sum_act_sqr[i];
2256
#else
2257
        ps_curr_out->i8_curr_frame_16x16_sum_act[i] = ai4_frame_16x16_sum_act[i];
2258
#endif
2259
186k
        ps_curr_out->i4_curr_frame_16x16_num_blks[i] = ai4_frame_16x16_sum_blks[i];
2260
2261
        /*32x32*/
2262
186k
#if USE_SQRT_AVG_OF_SATD_SQR
2263
186k
        ps_curr_out->i8_curr_frame_32x32_sum_act[i] = ai8_frame_32x32_sum_act_sqr[i];
2264
#else
2265
        ps_curr_out->i8_curr_frame_32x32_sum_act[i] = ai4_frame_32x32_sum_act[i];
2266
#endif
2267
186k
        ps_curr_out->i4_curr_frame_32x32_num_blks[i] = ai4_frame_32x32_sum_blks[i];
2268
186k
    }
2269
2270
    /*16x16*/
2271
93.0k
#if USE_SQRT_AVG_OF_SATD_SQR
2272
93.0k
    ps_curr_out->i8_curr_frame_16x16_sum_act[2] = ai8_frame_16x16_sum_act_sqr[2];
2273
#else
2274
    ps_curr_out->i8_curr_frame_16x16_sum_act[2] = ai4_frame_16x16_sum_act[2];
2275
#endif
2276
93.0k
    ps_curr_out->i4_curr_frame_16x16_num_blks[2] = ai4_frame_16x16_sum_blks[2];
2277
2278
    /*32x32*/
2279
93.0k
#if USE_SQRT_AVG_OF_SATD_SQR
2280
93.0k
    ps_curr_out->i8_curr_frame_32x32_sum_act[2] = ai8_frame_32x32_sum_act_sqr[2];
2281
#else
2282
    ps_curr_out->i8_curr_frame_32x32_sum_act[2] = ai4_frame_32x32_sum_act[2];
2283
#endif
2284
93.0k
    ps_curr_out->i4_curr_frame_32x32_num_blks[2] = ai4_frame_32x32_sum_blks[2];
2285
93.0k
}
2286
2287
/*!
2288
************************************************************************
2289
* @brief
2290
*  accumulate L1 intra satd across all threads.
2291
*  Note: call to this function has to be made after all threads have
2292
*  finished preintra processing
2293
*
2294
************************************************************************
2295
*/
2296
LWORD64 ihevce_decomp_pre_intra_get_frame_satd(void *pv_ctxt, WORD32 *wd, WORD32 *ht)
2297
93.3k
{
2298
93.3k
    ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt = pv_ctxt;
2299
93.3k
    ihevce_decomp_pre_intra_ctxt_t *ps_ctxt = ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[0];
2300
93.3k
    LWORD64 satd_sum = ps_ctxt->ps_ed_ctxt->i8_sum_best_satd;
2301
93.3k
    WORD32 i;
2302
2303
93.3k
    *wd = ps_ctxt->as_layers[1].i4_actual_wd;
2304
93.3k
    *ht = ps_ctxt->as_layers[1].i4_actual_ht;
2305
93.3k
    for(i = 1; i < ps_master_ctxt->i4_num_proc_thrds; i++)
2306
0
    {
2307
0
        ps_ctxt = ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[i];
2308
0
        satd_sum += ps_ctxt->ps_ed_ctxt->i8_sum_best_satd;
2309
0
    }
2310
2311
93.3k
    return satd_sum;
2312
93.3k
}
2313
2314
/*!
************************************************************************
* @brief
*  Accumulates the i8_sum_sq_best_satd value held by each pre-intra
*  thread context into a single frame-level total.
*
* @param[in]  pv_ctxt  pointer to the decomp pre-intra master context
* @param[out] wd       set to as_layers[1].i4_actual_wd of thread context 0
* @param[out] ht       set to as_layers[1].i4_actual_ht of thread context 0
*
* @return
*  i8_sum_sq_best_satd of thread context 0 plus that of every further
*  context with index below i4_num_proc_thrds
*
************************************************************************
*/
LWORD64 ihevce_decomp_pre_intra_get_frame_satd_squared(void *pv_ctxt, WORD32 *wd, WORD32 *ht)
{
    ihevce_decomp_pre_intra_master_ctxt_t *ps_master = pv_ctxt;
    ihevce_decomp_pre_intra_ctxt_t *ps_thrd = ps_master->aps_decomp_pre_intra_thrd_ctxt[0];
    /* Seed the total with thread 0; it is read unconditionally */
    LWORD64 i8_total_sq = ps_thrd->ps_ed_ctxt->i8_sum_sq_best_satd;
    WORD32 thrd_idx = 1;

    /* Dimensions are taken from thread 0's as_layers[1] entry */
    *wd = ps_thrd->as_layers[1].i4_actual_wd;
    *ht = ps_thrd->as_layers[1].i4_actual_ht;

    /* Fold in the accumulators of the remaining thread contexts */
    while(thrd_idx < ps_master->i4_num_proc_thrds)
    {
        ps_thrd = ps_master->aps_decomp_pre_intra_thrd_ctxt[thrd_idx];
        i8_total_sq += ps_thrd->ps_ed_ctxt->i8_sum_sq_best_satd;
        thrd_idx++;
    }

    return i8_total_sq;
}