Coverage Report

Created: 2025-08-28 06:38

/src/libhevc/common/ihevc_quant_iquant_ssd.c
Line
Count
Source (jump to first uncovered line)
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2018 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
/**
21
 *******************************************************************************
22
 * @file
23
 *  ihevc_quant_iquant_ssd.c
24
 *
25
 * @brief
26
 *  Contains function definitions for quantization, followed by Inverse
27
 *  quantization to find transform domain SSD
28
 *
29
 * @author
30
 *  100453, 100578
31
 *
32
 * @par List of Functions:
33
 *   - ihevc_quant_iquant_ssd()
34
 *   - ihevc_quant_iquant_ssd_flat_scale_mat()
35
 *
36
 * @remarks
37
 *  None
38
 *
39
 *******************************************************************************
40
 */
41
42
#include <stdio.h>
43
#include <string.h>
44
#include <stdlib.h>
45
#include "ihevc_typedefs.h"
46
#include "ihevc_macros.h"
47
#include "ihevc_platform_macros.h"
48
#include "ihevc_defs.h"
49
#include "ihevc_debug.h"
50
#include "ihevc_trans_tables.h"
51
#include "ihevc_quant_iquant_ssd.h"
52
#include "ihevc_func_selector.h"
53
#include "ihevc_trans_macros.h"
54
#include <assert.h>
55
56
/*****************************************************************************/
57
/* Globals                                                                   */
58
/*****************************************************************************/
59
60
61
/**
62
 *******************************************************************************
63
 *
64
 * @brief
65
 *  This function performs quantization, followed by Inverse
66
 *  quantization to find transform domain SSD
67
 *
68
 * @par Description:
69
 *  Performs quantization on coeffs
70
 *
71
 * @param[in] pi2_coeffs
72
 *  4x4 Coeffs
73
 *
74
 * @param[in] pi2_quant_coeff
75
 *  Scaling Matrix
76
 *
77
 * @param[out] pi2_dst
78
 *  Output 4x4 coefficients
79
 *
80
 * @param[in] qp_div
81
 *  Quantization parameter / 6
82
 *
83
 * @param[in] qp_rem
84
 *  Quantization parameter % 6
85
 *
86
 * @param[in] src_strd
87
 *  Input stride
88
 *
89
 * @param[in] dst_strd
90
 *  Output Stride
91
 *
92
 * @param[out] csbf
93
 *  coded sub block flag
94
 *
95
 * @param[in] csbf_strd
96
 *  coded sub block flag
97
 *
98
 * @param[out] zero_col
99
 *  zero column flag
100
 *
101
 * @param[out] zero_row
102
 *  zero column flag
103
 *
104
 * @returns  cbf
105
 * coded block flag
106
 *
107
 * @remarks
108
 *  None
109
 *
110
 *******************************************************************************
111
 */
112
113
WORD32 ihevc_quant_iquant_ssd
114
    (
115
    WORD16 *pi2_coeffs,
116
    WORD16 *pi2_quant_coeff,
117
    WORD16 *pi2_q_dst,
118
    WORD16 *pi2_iq_dst,
119
    WORD32  trans_size,
120
    WORD32 qp_div,/* qpscaled / 6 */
121
    WORD32 qp_rem,/* qpscaled % 6 */
122
    WORD32 q_add,
123
    WORD32 *pi4_quant_round_factor_0_1,
124
    WORD32 *pi4_quant_round_factor_1_2,
125
    WORD32 src_strd,
126
    WORD32 dst_q_strd,
127
    WORD32 dst_iq_strd,
128
    UWORD8 *csbf,
129
    WORD32 csbf_strd,
130
    WORD32 *zero_col,
131
    WORD32 *zero_row,
132
    WORD16 *pi2_dequant_coeff,
133
    LWORD64 *pi8_cost
134
    )
135
4.70M
{
136
4.70M
    WORD32 i, j;
137
4.70M
    WORD32 log2_size;
138
4.70M
    WORD16 *pi2_q_dst_orig;
139
4.70M
    WORD32 cbf = 0;
140
4.70M
    WORD32 bit_depth,shift_iq;
141
4.70M
    WORD32 val;
142
4.70M
    WORD16 i2_temp;
143
4.70M
    WORD32 ssd_cost = 0;
144
145
4.70M
    (void)pi4_quant_round_factor_0_1;
146
4.70M
    (void)pi4_quant_round_factor_1_2;
147
4.70M
    pi2_q_dst_orig  = pi2_q_dst;
148
149
    /* Quant initialization */
150
4.70M
    GETRANGE(log2_size, trans_size);
151
4.70M
    log2_size -= 1;
152
153
4.70M
    bit_depth = 8 + 0;
154
4.70M
    shift_iq = bit_depth + log2_size - 5;
155
156
41.9M
    for(i = 0; i < trans_size; i++)
157
37.2M
    {
158
551M
        for(j = 0; j < trans_size; j++)
159
514M
        {
160
            /*  Back up the coefficients before Quantization    */
161
514M
            i2_temp = pi2_coeffs[j];
162
163
            /*  Quantization    */
164
514M
            QUANT(pi2_q_dst[j], pi2_coeffs[j],
165
514M
                  pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
166
514M
                  log2_size, q_add);
167
168
            /*  Inverse Quantization    */
169
514M
            IQUANT(pi2_iq_dst[j],
170
514M
                   pi2_q_dst[j], /*pi2_src[index*src_strd]*/
171
514M
                   pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
172
                   /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
173
514M
                   shift_iq,
174
514M
                   qp_div);
175
176
            /*  SSD Computation & Accumulation  */
177
514M
            val = i2_temp - pi2_iq_dst[j];
178
514M
            ssd_cost += val*val;
179
180
514M
        }
181
182
37.2M
        pi2_q_dst   += dst_q_strd;
183
37.2M
        pi2_iq_dst  += dst_iq_strd;
184
37.2M
        pi2_quant_coeff += trans_size;
185
37.2M
        pi2_coeffs += src_strd;
186
37.2M
        pi2_dequant_coeff += trans_size;
187
37.2M
    }
188
189
    /* Store the cost */
190
4.70M
    *pi8_cost = ssd_cost;
191
192
    /* CSBF update */
193
4.70M
    {
194
4.70M
        WORD32 block_row, block_col;
195
4.70M
        WORD32 row, col;
196
4.70M
        WORD16 *pi2_block;
197
4.70M
        UWORD32 temp_zero_col = 0;
198
4.70M
        UWORD32 temp_zero_row = 0;
199
200
4.70M
        pi2_q_dst = pi2_q_dst_orig;
201
202
14.0M
        for(block_row = 0; block_row < trans_size; block_row += 4)
203
9.30M
        {
204
            //block_col is incrementing by 1 for easy update of csbf pointer
205
41.4M
            for(block_col = 0; block_col < trans_size / 4; block_col++)
206
32.1M
            {
207
32.1M
                pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
208
32.1M
                *(csbf + block_col) = 0;
209
210
148M
                for(row = 0; row < 4; row++)
211
119M
                {
212
587M
                    for(col = 0; col < 4; col++)
213
470M
                    {
214
470M
                        if(pi2_block[row * dst_q_strd + col] != 0)
215
3.05M
                        {
216
3.05M
                            *(csbf + block_col) = 1;
217
3.05M
                            break;
218
3.05M
                        }
219
470M
                    }
220
119M
                    if(*(csbf + block_col) == 1)
221
3.05M
                    {
222
                        /* zero_col update *//* temp_zero_col = ~zero_col */
223
3.05M
                        temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
224
                        // zero col can be optimized further. Now clearing the
225
                        // entire 4 bits corresponding to 4 colums of 4x4 block
226
                        // even if any 4x4 csbf is set
227
228
                        /* zero row update */ /* temp_zero_row = ~zero_row */
229
3.05M
                        temp_zero_row = (temp_zero_row) | (0xFU << block_row);
230
                        // zero row can be optimized further. Now clearing the
231
                        // entire 4 bits corresponding to 4 rows of 4x4 block
232
                        // even if any 4x4 csbf is set
233
234
3.05M
                        break;
235
3.05M
                    }
236
119M
                }
237
238
32.1M
                cbf = cbf || (*(csbf + block_col)); // cbf update
239
32.1M
            }
240
9.30M
            csbf += csbf_strd;
241
9.30M
        }
242
243
4.70M
        *zero_col = ~temp_zero_col; //final zero_col storing
244
4.70M
        *zero_row = ~temp_zero_row; //final zero_row storing
245
4.70M
    }
246
247
4.70M
    return cbf;
248
4.70M
}
249
250
/**
251
 *******************************************************************************
252
 *
253
 * @brief
254
 *  This function performs quantization, followed by Inverse
255
 *  quantization
256
 *
257
 * @par Description:
258
 *  Performs quantization on coeffs
259
 *
260
 * @param[in] pi2_coeffs
261
 *  4x4 Coeffs
262
 *
263
 * @param[in] pi2_quant_coeff
264
 *  Scaling Matrix
265
 *
266
 * @param[out] pi2_dst
267
 *  Output 4x4 coefficients
268
 *
269
 * @param[in] qp_div
270
 *  Quantization parameter / 6
271
 *
272
 * @param[in] qp_rem
273
 *  Quantization parameter % 6
274
 *
275
 * @param[in] src_strd
276
 *  Input stride
277
 *
278
 * @param[in] dst_strd
279
 *  Output Stride
280
 *
281
 * @param[out] csbf
282
 *  coded sub block flag
283
 *
284
 * @param[in] csbf_strd
285
 *  coded sub block flag
286
 *
287
 * @param[out] zero_col
288
 *  zero column flag
289
 *
290
 * @param[out] zero_row
291
 *  zero column flag
292
 *
293
 * @returns  cbf
294
 * coded block flag
295
 *
296
 * @remarks
297
 *  None
298
 *
299
 *******************************************************************************
300
 */
301
302
WORD32 ihevc_quant_iquant
303
    (
304
    WORD16 *pi2_coeffs,
305
    WORD16 *pi2_quant_coeff,
306
    WORD16 *pi2_q_dst,
307
    WORD16 *pi2_iq_dst,
308
    WORD32  trans_size,
309
    WORD32 qp_div,/* qpscaled / 6 */
310
    WORD32 qp_rem,/* qpscaled % 6 */
311
    WORD32 q_add,
312
    WORD32 *pi4_quant_round_factor_0_1,
313
    WORD32 *pi4_quant_round_factor_1_2,
314
    WORD32 src_strd,
315
    WORD32 dst_q_strd,
316
    WORD32 dst_iq_strd,
317
    UWORD8 *csbf,
318
    WORD32 csbf_strd,
319
    WORD32 *zero_col,
320
    WORD32 *zero_row,
321
    WORD16 *pi2_dequant_coeff,
322
    LWORD64 *pi8_cost
323
    )
324
0
{
325
0
    WORD32 i, j;
326
0
    WORD32 log2_size;
327
0
    WORD16 *pi2_q_dst_orig;
328
0
    WORD32 cbf = 0;
329
0
    WORD32 bit_depth,shift_iq;
330
0
    WORD16 i2_temp;
331
332
0
    (void)pi8_cost;
333
0
    (void)pi4_quant_round_factor_0_1;
334
0
    (void)pi4_quant_round_factor_1_2;
335
0
    pi2_q_dst_orig  = pi2_q_dst;
336
337
    /* Quant initialization */
338
0
    GETRANGE(log2_size, trans_size);
339
0
    log2_size -= 1;
340
341
0
    bit_depth = 8;
342
0
    shift_iq = bit_depth + log2_size - 5;
343
344
0
    for(i = 0; i < trans_size; i++)
345
0
    {
346
0
        for(j = 0; j < trans_size; j++)
347
0
        {
348
            /*  Back up the coefficients before Quantization    */
349
0
            i2_temp = pi2_coeffs[j];
350
351
            /*  Quantization    */
352
0
            QUANT(pi2_q_dst[j], pi2_coeffs[j],
353
0
                  pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
354
0
                  log2_size, q_add);
355
356
            /*  Inverse Quantization    */
357
0
            IQUANT(pi2_iq_dst[j],
358
0
                   pi2_q_dst[j], /*pi2_src[index*src_strd]*/
359
0
                   pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
360
0
                   shift_iq,
361
0
                   qp_div);
362
0
        }
363
364
0
        pi2_q_dst   += dst_q_strd;
365
0
        pi2_iq_dst  += dst_iq_strd;
366
0
        pi2_quant_coeff += trans_size;
367
0
        pi2_coeffs += src_strd;
368
0
        pi2_dequant_coeff += trans_size;
369
0
    }
370
371
    /* CSBF update */
372
0
    {
373
0
        WORD32 block_row, block_col;
374
0
        WORD32 row, col;
375
0
        WORD16 *pi2_block;
376
0
        UWORD32 temp_zero_col = 0;
377
0
        UWORD32 temp_zero_row = 0;
378
379
0
        pi2_q_dst = pi2_q_dst_orig;
380
381
0
        for(block_row = 0; block_row < trans_size; block_row += 4)
382
0
        {
383
            //block_col is incrementing by 1 for easy update of csbf pointer
384
0
            for(block_col = 0; block_col < trans_size / 4; block_col++)
385
0
            {
386
0
                pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
387
0
                *(csbf + block_col) = 0;
388
389
0
                for(row = 0; row < 4; row++)
390
0
                {
391
0
                    for(col = 0; col < 4; col++)
392
0
                    {
393
0
                        if(pi2_block[row * dst_q_strd + col] != 0)
394
0
                        {
395
0
                            *(csbf + block_col) = 1;
396
0
                            break;
397
0
                        }
398
0
                    }
399
0
                    if(*(csbf + block_col) == 1)
400
0
                    {
401
                        /* zero_col update *//* temp_zero_col = ~zero_col */
402
0
                        temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
403
                        // zero col can be optimized further. Now clearing the
404
                        // entire 4 bits corresponding to 4 colums of 4x4 block
405
                        // even if any 4x4 csbf is set
406
407
                        /* zero row update */ /* temp_zero_row = ~zero_row */
408
0
                        temp_zero_row = (temp_zero_row) | (0xFU << block_row);
409
                        // zero row can be optimized further. Now clearing the
410
                        // entire 4 bits corresponding to 4 rows of 4x4 block
411
                        // even if any 4x4 csbf is set
412
413
0
                        break;
414
0
                    }
415
0
                }
416
417
0
                cbf = cbf || (*(csbf + block_col)); // cbf update
418
0
            }
419
420
0
            csbf += csbf_strd;
421
0
        }
422
423
0
        *zero_col = ~temp_zero_col; //final zero_col storing
424
0
        *zero_row = ~temp_zero_row; //final zero_row storing
425
0
    }
426
427
0
    return cbf;
428
0
}
429
430
/**
431
 *******************************************************************************
432
 *
433
 * @brief
434
 *  This function performs quantization, followed by Inverse
435
 *  quantization to find transform domain SSD
436
 *
437
 * @par Description:
438
 *  Performs quantization on coeffs
439
 *
440
 * @param[in] pi2_coeffs
441
 *  4x4 Coeffs
442
 *
443
 * @param[in] pi2_quant_coeff
444
 *  Scaling Matrix
445
 *
446
 * @param[out] pi2_dst
447
 *  Output 4x4 coefficients
448
 *
449
 * @param[in] qp_div
450
 *  Quantization parameter / 6
451
 *
452
 * @param[in] qp_rem
453
 *  Quantization parameter % 6
454
 *
455
 * @param[in] src_strd
456
 *  Input stride
457
 *
458
 * @param[in] dst_strd
459
 *  Output Stride
460
 *
461
 * @param[out] csbf
462
 *  coded sub block flag
463
 *
464
 * @param[in] csbf_strd
465
 *  coded sub block flag
466
 *
467
 * @param[out] zero_col
468
 *  zero column flag
469
 *
470
 * @param[out] zero_row
471
 *  zero column flag
472
 *
473
 * @returns  cbf
474
 * coded block flag
475
 *
476
 * @remarks
477
 *  None
478
 *
479
 *******************************************************************************
480
 */
481
482
WORD32 ihevc_quant_iquant_ssd_rdoq
483
    (
484
    WORD16 *pi2_coeffs,
485
    WORD16 *pi2_quant_coeff,
486
    WORD16 *pi2_q_dst,
487
    WORD16 *pi2_iq_dst,
488
    WORD32  trans_size,
489
    WORD32 qp_div,/* qpscaled / 6 */
490
    WORD32 qp_rem,/* qpscaled % 6 */
491
    WORD32 q_add,
492
    WORD32 *pi4_quant_round_factor_0_1,
493
    WORD32 *pi4_quant_round_factor_1_2,
494
    WORD32 src_strd,
495
    WORD32 dst_q_strd,
496
    WORD32 dst_iq_strd,
497
    UWORD8 *csbf,
498
    WORD32 csbf_strd,
499
    WORD32 *zero_col,
500
    WORD32 *zero_row,
501
    WORD16 *pi2_dequant_coeff,
502
    LWORD64 *pi8_cost
503
    )
504
0
{
505
0
    WORD32 i, j;
506
0
    WORD32 log2_size;
507
0
    WORD16 *pi2_q_dst_orig;
508
0
    WORD32 cbf = 0;
509
0
    WORD32 bit_depth,shift_iq;
510
0
    WORD32 val;
511
0
    WORD16 i2_temp;
512
0
    WORD32 ssd_cost = 0;
513
514
0
    (void)pi4_quant_round_factor_0_1;
515
0
    (void)pi4_quant_round_factor_1_2;
516
0
    pi2_q_dst_orig  = pi2_q_dst;
517
518
0
    GETRANGE(log2_size, trans_size);
519
0
    log2_size -= 1;
520
521
0
    bit_depth = 8 + 0;
522
0
    shift_iq = bit_depth + log2_size - 5;
523
524
0
    for(i = 0; i < trans_size; i++)
525
0
    {
526
0
        for(j = 0; j < trans_size; j++)
527
0
        {
528
            /*  Back up the coefficients before Quantization    */
529
0
            i2_temp = pi2_coeffs[j];
530
531
            /*  Quantization    */
532
0
            QUANT(pi2_q_dst[j], pi2_coeffs[j],
533
0
                pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
534
0
                log2_size, q_add);
535
536
537
0
            if (abs(pi2_q_dst[j]) > 1)
538
0
            {
539
0
                QUANT(pi2_q_dst[j],i2_temp,
540
0
                    pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
541
0
                    log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
542
543
0
            }
544
545
546
            /*  Inverse Quantization    */
547
0
            IQUANT(pi2_iq_dst[j],
548
0
                pi2_q_dst[j], /*pi2_src[index*src_strd]*/
549
0
                pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
550
                /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
551
0
                shift_iq,
552
0
                qp_div);
553
554
            /*  SSD Computation & Accumulation  */
555
0
            val = i2_temp - pi2_iq_dst[j];
556
0
            ssd_cost += val*val;
557
558
0
        }
559
560
0
        pi2_q_dst   += dst_q_strd;
561
0
        pi2_iq_dst  += dst_iq_strd;
562
0
        pi2_quant_coeff += trans_size;
563
0
        pi2_coeffs += src_strd;
564
0
        pi2_dequant_coeff += trans_size;
565
0
    }
566
    /* Store the cost */
567
0
    *pi8_cost = ssd_cost;
568
569
    /* CSBF update */
570
0
    {
571
0
        WORD32 block_row, block_col;
572
0
        WORD32 row, col;
573
0
        WORD16 *pi2_block;
574
0
        UWORD32 temp_zero_col = 0;
575
0
        UWORD32 temp_zero_row = 0;
576
577
0
        pi2_q_dst = pi2_q_dst_orig;
578
579
0
        for(block_row = 0; block_row < trans_size; block_row += 4)
580
0
        {
581
            //block_col is incrementing by 1 for easy update of csbf pointer
582
0
            for(block_col = 0; block_col < trans_size / 4; block_col++)
583
0
            {
584
0
                pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
585
0
                *(csbf + block_col) = 0;
586
587
0
                for(row = 0; row < 4; row++)
588
0
                {
589
0
                    for(col = 0; col < 4; col++)
590
0
                    {
591
0
                        if(pi2_block[row * dst_q_strd + col] != 0)
592
0
                        {
593
0
                            *(csbf + block_col) = 1;
594
0
                            break;
595
0
                        }
596
0
                    }
597
0
                    if(*(csbf + block_col) == 1)
598
0
                    {
599
                        /* zero_col update *//* temp_zero_col = ~zero_col */
600
0
                        temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
601
                        // zero col can be optimized further. Now clearing the
602
                        // entire 4 bits corresponding to 4 colums of 4x4 block
603
                        // even if any 4x4 csbf is set
604
605
                        /* zero row update */ /* temp_zero_row = ~zero_row */
606
0
                        temp_zero_row = (temp_zero_row) | (0xFU << block_row);
607
                        // zero row can be optimized further. Now clearing the
608
                        // entire 4 bits corresponding to 4 rows of 4x4 block
609
                        // even if any 4x4 csbf is set
610
611
0
                        break;
612
0
                    }
613
0
                }
614
615
0
                cbf = cbf || (*(csbf + block_col)); // cbf update
616
0
            }
617
0
            csbf += csbf_strd;
618
0
        }
619
620
0
        *zero_col = ~temp_zero_col; //final zero_col storing
621
0
        *zero_row = ~temp_zero_row; //final zero_row storing
622
0
    }
623
624
0
    return cbf;
625
0
}
626
627
WORD32 ihevc_quant_iquant_rdoq
628
    (
629
    WORD16 *pi2_coeffs,
630
    WORD16 *pi2_quant_coeff,
631
    WORD16 *pi2_q_dst,
632
    WORD16 *pi2_iq_dst,
633
    WORD32  trans_size,
634
    WORD32 qp_div,/* qpscaled / 6 */
635
    WORD32 qp_rem,/* qpscaled % 6 */
636
    WORD32 q_add,
637
    WORD32 *pi4_quant_round_factor_0_1,
638
    WORD32 *pi4_quant_round_factor_1_2,
639
    WORD32 src_strd,
640
    WORD32 dst_q_strd,
641
    WORD32 dst_iq_strd,
642
    UWORD8 *csbf,
643
    WORD32 csbf_strd,
644
    WORD32 *zero_col,
645
    WORD32 *zero_row,
646
    WORD16 *pi2_dequant_coeff,
647
    LWORD64 *pi8_cost
648
    )
649
0
{
650
0
    WORD32 i, j;
651
0
    WORD32 log2_size;
652
0
    WORD16 *pi2_q_dst_orig;
653
0
    WORD32 cbf = 0;
654
0
    WORD32 bit_depth,shift_iq;
655
0
    WORD16 i2_temp;
656
657
0
    (void)pi8_cost;
658
0
    (void)pi4_quant_round_factor_0_1;
659
0
    (void)pi4_quant_round_factor_1_2;
660
0
    pi2_q_dst_orig  = pi2_q_dst;
661
662
0
    GETRANGE(log2_size, trans_size);
663
0
    log2_size -= 1;
664
665
0
    bit_depth = 8 + 0;
666
0
    shift_iq = bit_depth + log2_size - 5;
667
668
0
    for(i = 0; i < trans_size; i++)
669
0
    {
670
0
        for(j = 0; j < trans_size; j++)
671
0
        {
672
            /*  Back up the coefficients before Quantization    */
673
0
            i2_temp = pi2_coeffs[j];
674
675
            /*  Quantization    */
676
0
            QUANT(pi2_q_dst[j], pi2_coeffs[j],
677
0
                pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
678
0
                log2_size, q_add);
679
680
0
            if (abs(pi2_q_dst[j]) > 1)
681
0
            {
682
0
                QUANT(pi2_q_dst[j],i2_temp,
683
0
                    pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
684
0
                    log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
685
0
            }
686
687
            /*  Inverse Quantization    */
688
0
            IQUANT(pi2_iq_dst[j],
689
0
                pi2_q_dst[j], /*pi2_src[index*src_strd]*/
690
0
                pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
691
0
                shift_iq,
692
0
                qp_div);
693
0
        }
694
695
0
        pi2_q_dst   += dst_q_strd;
696
0
        pi2_iq_dst  += dst_iq_strd;
697
0
        pi2_quant_coeff += trans_size;
698
0
        pi2_coeffs += src_strd;
699
0
        pi2_dequant_coeff += trans_size;
700
0
    }
701
702
    /* CSBF update */
703
0
    {
704
0
        WORD32 block_row, block_col;
705
0
        WORD32 row, col;
706
0
        WORD16 *pi2_block;
707
0
        UWORD32 temp_zero_col = 0;
708
0
        UWORD32 temp_zero_row = 0;
709
710
0
        pi2_q_dst = pi2_q_dst_orig;
711
712
0
        for(block_row = 0; block_row < trans_size; block_row += 4)
713
0
        {
714
            //block_col is incrementing by 1 for easy update of csbf pointer
715
0
            for(block_col = 0; block_col < trans_size / 4; block_col++)
716
0
            {
717
0
                pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
718
0
                *(csbf + block_col) = 0;
719
720
0
                for(row = 0; row < 4; row++)
721
0
                {
722
0
                    for(col = 0; col < 4; col++)
723
0
                    {
724
0
                        if(pi2_block[row * dst_q_strd + col] != 0)
725
0
                        {
726
0
                            *(csbf + block_col) = 1;
727
0
                            break;
728
0
                        }
729
0
                    }
730
0
                    if(*(csbf + block_col) == 1)
731
0
                    {
732
                        /* zero_col update *//* temp_zero_col = ~zero_col */
733
0
                        temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
734
                        // zero col can be optimized further. Now clearing the
735
                        // entire 4 bits corresponding to 4 colums of 4x4 block
736
                        // even if any 4x4 csbf is set
737
738
                        /* zero row update */ /* temp_zero_row = ~zero_row */
739
0
                        temp_zero_row = (temp_zero_row) | (0xFU << block_row);
740
                        // zero row can be optimized further. Now clearing the
741
                        // entire 4 bits corresponding to 4 rows of 4x4 block
742
                        // even if any 4x4 csbf is set
743
744
0
                        break;
745
0
                    }
746
0
                }
747
748
0
                cbf = cbf || (*(csbf + block_col)); // cbf update
749
0
            }
750
0
            csbf += csbf_strd;
751
0
        }
752
753
0
        *zero_col = ~temp_zero_col; //final zero_col storing
754
0
        *zero_row = ~temp_zero_row; //final zero_row storing
755
0
    }
756
757
0
    return cbf;
758
0
}
759
760
/**
761
 *******************************************************************************
762
 *
763
 * @brief
764
 *  This function performs quantization(using flat scale matrix), followed by
765
 *  inverse quantization to find transform domain SSD
766
 *
767
 * @par Description:
768
 *  Performs quantization on coeffs
769
 *
770
 * @param[in] pi2_coeffs
771
 *  4x4 Coeffs
772
 *
773
 * @param[in] pi2_quant_coeff
774
 *  Scaling Matrix
775
 *
776
 * @param[out] pi2_dst
777
 *  Output 4x4 coefficients
778
 *
779
 * @param[in] qp_div
780
 *  Quantization parameter / 6
781
 *
782
 * @param[in] qp_rem
783
 *  Quantization parameter % 6
784
 *
785
 * @param[in] src_strd
786
 *  Input stride
787
 *
788
 * @param[in] dst_strd
789
 *  Output Stride
790
 *
791
 * @param[out] csbf
792
 *  coded sub block flag
793
 *
794
 * @param[in] csbf_strd
795
 *  coded sub block flag
796
 *
797
 * @param[out] zero_col
798
 *  zero column flag
799
 *
800
 * @param[out] zero_row
801
 *  zero column flag
802
 *
803
 * @returns  cbf
804
 * coded block flag
805
 *
806
 * @remarks
807
 *  None
808
 *
809
 *******************************************************************************
810
 */
811
812
WORD32 ihevc_quant_iquant_ssd_flat_scale_mat
813
    (
814
    WORD16 *pi2_coeffs,
815
    WORD16 *pi2_quant_coeff,
816
    WORD16 *pi2_q_dst,
817
    WORD16 *pi2_iq_dst,
818
    WORD32  trans_size,
819
    WORD32 qp_div,/* qpscaled / 6 */
820
    WORD32 qp_rem,/* qpscaled % 6 */
821
    WORD32 q_add,
822
    WORD32 *pi4_quant_round_factor_0_1,
823
    WORD32 *pi4_quant_round_factor_1_2,
824
    WORD32 src_strd,
825
    WORD32 dst_q_strd,
826
    WORD32 dst_iq_strd,
827
    UWORD8 *csbf,
828
    WORD32 csbf_strd,
829
    WORD32 *zero_col,
830
    WORD32 *zero_row,
831
    WORD16 *pi2_dequant_coeff,
832
    LWORD64 *pi8_cost
833
    )
834
9.62M
{
835
9.62M
    WORD32 i, j;
836
9.62M
    WORD32 log2_size;
837
9.62M
    WORD16 *pi2_q_dst_orig;
838
9.62M
    WORD32 cbf = 0;
839
9.62M
    WORD32 bit_depth,shift_iq;
840
9.62M
    WORD32 val;
841
9.62M
    WORD16 i2_temp;
842
    /* Initialize cost to zero */
843
9.62M
    WORD32 ssd_cost = 0;
844
845
9.62M
    (void)pi4_quant_round_factor_0_1;
846
9.62M
    (void)pi4_quant_round_factor_1_2;
847
9.62M
    pi2_q_dst_orig  = pi2_q_dst;
848
849
    /* Quant initialization */
850
9.62M
    GETRANGE(log2_size, trans_size);
851
9.62M
    log2_size -= 1;
852
853
9.62M
    bit_depth = 8 + 0;
854
9.62M
    shift_iq = bit_depth + log2_size - 5;
855
856
92.6M
    for(i = 0; i < trans_size; i++)
857
83.0M
    {
858
1.24G
        for(j = 0; j < trans_size; j++)
859
1.16G
        {
860
            /*  Back up the coefficients before Quantization    */
861
1.16G
            i2_temp = pi2_coeffs[j];
862
863
            /*QUANT(pi2_dst[j], pi2_coeffs[j],
864
            pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
865
            log2_size, q_add);*/
866
867
            /* modified by 1028 */
868
            /*  Quantization    */
869
1.16G
            QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
870
1.16G
                  g_ihevc_quant_scales[qp_rem], qp_div,
871
1.16G
                  log2_size, q_add);
872
873
1.16G
            if(pi2_q_dst[j] == 0)
874
1.10G
            {
875
1.10G
                pi2_iq_dst[j] = 0;
876
1.10G
            }
877
60.3M
            else
878
60.3M
            {
879
            /*  Inverse Quantization    */
880
60.3M
            IQUANT(pi2_iq_dst[j],
881
60.3M
                    pi2_q_dst[j], /*pi2_src[index*src_strd]*/
882
60.3M
                    pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem], /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
883
60.3M
                    shift_iq,
884
60.3M
                    qp_div);
885
60.3M
            }
886
887
            /*  SSD Computation & Accumulation  */
888
1.16G
            val = i2_temp - pi2_iq_dst[j];
889
1.16G
            ssd_cost += val*val;
890
891
1.16G
        }
892
893
83.0M
        pi2_q_dst   += dst_q_strd;
894
83.0M
        pi2_iq_dst  += dst_iq_strd;
895
83.0M
        pi2_quant_coeff += trans_size;
896
83.0M
        pi2_coeffs += src_strd;
897
83.0M
        pi2_dequant_coeff += trans_size;
898
83.0M
    }
899
    /* Store the cost */
900
9.62M
    *pi8_cost = ssd_cost;
901
902
    /* CSBF update */
903
9.62M
    {
904
9.62M
        WORD32 block_row, block_col;
905
9.62M
        WORD32 row, col;
906
9.62M
        WORD16 *pi2_block;
907
9.62M
        UWORD32 temp_zero_col = 0;
908
9.62M
        UWORD32 temp_zero_row = 0;
909
910
9.62M
        pi2_q_dst = pi2_q_dst_orig;
911
912
30.3M
        for(block_row = 0; block_row < trans_size; block_row += 4)
913
20.7M
        {
914
            //block_col is incrementing by 1 for easy update of csbf pointer
915
93.3M
            for(block_col = 0; block_col < trans_size / 4; block_col++)
916
72.5M
            {
917
72.5M
                pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
918
72.5M
                *(csbf + block_col) = 0;
919
920
343M
                for(row = 0; row < 4; row++)
921
275M
                {
922
1.35G
                    for(col = 0; col < 4; col++)
923
1.08G
                    {
924
1.08G
                        if(pi2_block[row * dst_q_strd + col] != 0)
925
5.10M
                        {
926
5.10M
                            *(csbf + block_col) = 1;
927
5.10M
                            break;
928
5.10M
                        }
929
1.08G
                    }
930
275M
                    if(*(csbf + block_col) == 1)
931
5.10M
                    {
932
                        /* zero_col update *//* temp_zero_col = ~zero_col */
933
5.10M
                        temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
934
                        // zero col can be optimized further. Now clearing the
935
                        // entire 4 bits corresponding to 4 colums of 4x4 block
936
                        // even if any 4x4 csbf is set
937
938
                        /* zero row update */ /* temp_zero_row = ~zero_row */
939
5.10M
                        temp_zero_row = (temp_zero_row) | (0xFU << block_row);
940
                        // zero row can be optimized further. Now clearing the
941
                        // entire 4 bits corresponding to 4 rows of 4x4 block
942
                        // even if any 4x4 csbf is set
943
944
5.10M
                        break;
945
5.10M
                    }
946
275M
                }
947
948
72.5M
                cbf = cbf || (*(csbf + block_col)); // cbf update
949
72.5M
            }
950
20.7M
            csbf += csbf_strd;
951
20.7M
        }
952
953
9.62M
        *zero_col = ~temp_zero_col; //final zero_col storing
954
9.62M
        *zero_row = ~temp_zero_row; //final zero_row storing
955
9.62M
    }
956
957
9.62M
    return cbf;
958
9.62M
}
959
960
WORD32 ihevc_quant_iquant_flat_scale_mat
961
    (
962
    WORD16 *pi2_coeffs,
963
    WORD16 *pi2_quant_coeff,
964
    WORD16 *pi2_q_dst,
965
    WORD16 *pi2_iq_dst,
966
    WORD32  trans_size,
967
    WORD32 qp_div,/* qpscaled / 6 */
968
    WORD32 qp_rem,/* qpscaled % 6 */
969
    WORD32 q_add,
970
    WORD32 *pi4_quant_round_factor_0_1,
971
    WORD32 *pi4_quant_round_factor_1_2,
972
    WORD32 src_strd,
973
    WORD32 dst_q_strd,
974
    WORD32 dst_iq_strd,
975
    UWORD8 *csbf,
976
    WORD32 csbf_strd,
977
    WORD32 *zero_col,
978
    WORD32 *zero_row,
979
    WORD16 *pi2_dequant_coeff,
980
    LWORD64 *pi8_cost
981
    )
982
0
{
983
0
    WORD32 i, j;
984
0
    WORD32 log2_size;
985
0
    WORD16 *pi2_q_dst_orig;
986
0
    WORD32 cbf = 0;
987
0
    WORD32 bit_depth,shift_iq;
988
0
    WORD16 i2_temp;
989
990
0
    (void)pi8_cost;
991
0
    (void)pi4_quant_round_factor_0_1;
992
0
    (void)pi4_quant_round_factor_1_2;
993
0
    pi2_q_dst_orig  = pi2_q_dst;
994
995
    /* Quant initialization */
996
0
    GETRANGE(log2_size, trans_size);
997
0
    log2_size -= 1;
998
999
0
    bit_depth = 8 + 0;
1000
0
    shift_iq = bit_depth + log2_size - 5;
1001
1002
0
    for(i = 0; i < trans_size; i++)
1003
0
    {
1004
0
        for(j = 0; j < trans_size; j++)
1005
0
        {
1006
            /*  Back up the coefficients before Quantization    */
1007
0
            i2_temp = pi2_coeffs[j];
1008
1009
            /*  Quantization    */
1010
0
            QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
1011
0
                  g_ihevc_quant_scales[qp_rem], qp_div,
1012
0
                  log2_size, q_add);
1013
1014
0
            if(pi2_q_dst[j] == 0)
1015
0
            {
1016
0
                pi2_iq_dst[j] = 0;
1017
0
            }
1018
0
            else
1019
0
            {
1020
            /*  Inverse Quantization    */
1021
0
            IQUANT(pi2_iq_dst[j],
1022
0
                    pi2_q_dst[j], /*pi2_src[index*src_strd]*/
1023
0
                    pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem], /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
1024
0
                    shift_iq,
1025
0
                    qp_div);
1026
0
            }
1027
0
        }
1028
1029
0
        pi2_q_dst   += dst_q_strd;
1030
0
        pi2_iq_dst  += dst_iq_strd;
1031
0
        pi2_quant_coeff += trans_size;
1032
0
        pi2_coeffs += src_strd;
1033
0
        pi2_dequant_coeff += trans_size;
1034
0
    }
1035
1036
    /* CSBF update */
1037
0
    {
1038
0
        WORD32 block_row, block_col;
1039
0
        WORD32 row, col;
1040
0
        WORD16 *pi2_block;
1041
0
        UWORD32 temp_zero_col = 0;
1042
0
        UWORD32 temp_zero_row = 0;
1043
1044
0
        pi2_q_dst = pi2_q_dst_orig;
1045
1046
0
        for(block_row = 0; block_row < trans_size; block_row += 4)
1047
0
        {
1048
            //block_col is incrementing by 1 for easy update of csbf pointer
1049
0
            for(block_col = 0; block_col < trans_size / 4; block_col++)
1050
0
            {
1051
0
                pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
1052
0
                *(csbf + block_col) = 0;
1053
1054
0
                for(row = 0; row < 4; row++)
1055
0
                {
1056
0
                    for(col = 0; col < 4; col++)
1057
0
                    {
1058
0
                        if(pi2_block[row * dst_q_strd + col] != 0)
1059
0
                        {
1060
0
                            *(csbf + block_col) = 1;
1061
0
                            break;
1062
0
                        }
1063
0
                    }
1064
0
                    if(*(csbf + block_col) == 1)
1065
0
                    {
1066
                        /* zero_col update *//* temp_zero_col = ~zero_col */
1067
0
                        temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
1068
                        // zero col can be optimized further. Now clearing the
1069
                        // entire 4 bits corresponding to 4 colums of 4x4 block
1070
                        // even if any 4x4 csbf is set
1071
1072
                        /* zero row update */ /* temp_zero_row = ~zero_row */
1073
0
                        temp_zero_row = (temp_zero_row) | (0xFU << block_row);
1074
                        // zero row can be optimized further. Now clearing the
1075
                        // entire 4 bits corresponding to 4 rows of 4x4 block
1076
                        // even if any 4x4 csbf is set
1077
1078
0
                        break;
1079
0
                    }
1080
0
                }
1081
1082
0
                cbf = cbf || (*(csbf + block_col)); // cbf update
1083
0
            }
1084
0
            csbf += csbf_strd;
1085
0
        }
1086
1087
0
        *zero_col = ~temp_zero_col; //final zero_col storing
1088
0
        *zero_row = ~temp_zero_row; //final zero_row storing
1089
0
    }
1090
1091
0
    return cbf;
1092
0
}
1093
1094
/**
1095
 *******************************************************************************
1096
 *
1097
 * @brief
1098
 *  This function performs quantization(using flat scale matrix), followed by
1099
 *  inverse quantization to find transform domain SSD; when we perform RDOQ.
1100
 *  In case the quantized value turns out to be grater than 1, we then requantize
1101
 *  use half rounding.
1102
 *
1103
 * @par Description:
1104
 *  Performs quantization on coeffs
1105
 *
1106
 * @param[in] pi2_coeffs
1107
 *  4x4 Coeffs
1108
 *
1109
 * @param[in] pi2_quant_coeff
1110
 *  Scaling Matrix
1111
 *
1112
 * @param[out] pi2_dst
1113
 *  Output 4x4 coefficients
1114
 *
1115
 * @param[in] qp_div
1116
 *  Quantization parameter / 6
1117
 *
1118
 * @param[in] qp_rem
1119
 *  Quantization parameter % 6
1120
 *
1121
 * @param[in] src_strd
1122
 *  Input stride
1123
 *
1124
 * @param[in] dst_strd
1125
 *  Output Stride
1126
 *
1127
 * @param[out] csbf
1128
 *  coded sub block flag
1129
 *
1130
 * @param[in] csbf_strd
1131
 *  coded sub block flag
1132
 *
1133
 * @param[out] zero_col
1134
 *  zero column flag
1135
 *
1136
 * @param[out] zero_row
1137
 *  zero column flag
1138
 *
1139
 * @returns  cbf
1140
 * coded block flag
1141
 *
1142
 * @remarks
1143
 *  None
1144
 *
1145
 *******************************************************************************
1146
 */
1147
1148
WORD32 ihevc_quant_iquant_ssd_flat_scale_mat_rdoq
1149
    (
1150
    WORD16 *pi2_coeffs,
1151
    WORD16 *pi2_quant_coeff,
1152
    WORD16 *pi2_q_dst,
1153
    WORD16 *pi2_iq_dst,
1154
    WORD32  trans_size,
1155
    WORD32 qp_div,/* qpscaled / 6 */
1156
    WORD32 qp_rem,/* qpscaled % 6 */
1157
    WORD32 q_add,
1158
    WORD32 *pi4_quant_round_factor_0_1,
1159
    WORD32 *pi4_quant_round_factor_1_2,
1160
    WORD32 src_strd,
1161
    WORD32 dst_q_strd,
1162
    WORD32 dst_iq_strd,
1163
    UWORD8 *csbf,
1164
    WORD32 csbf_strd,
1165
    WORD32 *zero_col,
1166
    WORD32 *zero_row,
1167
    WORD16 *pi2_dequant_coeff,
1168
    LWORD64 *pi8_cost
1169
    )
1170
0
{
1171
0
    WORD32 i, j;
1172
0
    WORD32 log2_size;
1173
0
    WORD16 *pi2_q_dst_orig;
1174
0
    WORD32 cbf = 0;
1175
0
    WORD32 bit_depth,shift_iq;
1176
0
    WORD32 val;
1177
0
    WORD16 i2_temp;
1178
    /* Initialize cost to zero */
1179
0
    WORD32 ssd_cost = 0;
1180
1181
0
    (void)pi4_quant_round_factor_0_1;
1182
0
    (void)pi4_quant_round_factor_1_2;
1183
0
    pi2_q_dst_orig  = pi2_q_dst;
1184
1185
    /* Quant initialization */
1186
0
    GETRANGE(log2_size, trans_size);
1187
0
    log2_size -= 1;
1188
1189
0
    bit_depth = 8 + 0;
1190
0
    shift_iq = bit_depth + log2_size - 5;
1191
1192
0
    for(i = 0; i < trans_size; i++)
1193
0
    {
1194
0
        for(j = 0; j < trans_size; j++)
1195
0
        {
1196
0
            WORD16 i2_temp1;
1197
            /*  Back up the coefficients before Quantization    */
1198
0
            i2_temp = pi2_coeffs[j];
1199
1200
            /*QUANT(pi2_dst[j], pi2_coeffs[j],
1201
            pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1202
            log2_size, q_add);*/
1203
1204
            /* modified by 1028 */
1205
            /*  Quantization    */
1206
1207
0
            if (1)
1208
0
            {
1209
0
                QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
1210
0
                  g_ihevc_quant_scales[qp_rem], qp_div,
1211
0
                  log2_size, q_add);
1212
0
            }
1213
0
            else
1214
0
            {                                                                                                                                                                \
1215
0
                WORD16 inp = pi2_coeffs[j],out = pi2_q_dst[j];
1216
0
                WORD32 quant_coeff = g_ihevc_quant_scales[qp_rem];
1217
0
                WORD32 log2_trans_size = log2_size;
1218
0
                WORD32 tmp;                                                                                                                                                  \
1219
0
                WORD32 sign;                                                                                                                                                 \
1220
0
                WORD32 bit_depth,transform_shift;                                                                                                                            \
1221
0
                WORD32  q_bits, quant_multiplier;                                                                                                                            \
1222
0
                                                                                                                                                                                \
1223
                /* q_bits and q_add calculation*/                                                                                                                            \
1224
                /* To be moved outside in neon. To be computer once per transform call */                                                                                    \
1225
0
                bit_depth = 8;                                                                                                                                               \
1226
0
                transform_shift = MAX_TR_DYNAMIC_RANGE - bit_depth - log2_trans_size;                                                                                        \
1227
0
                quant_multiplier = 4 ; /* because quant_coeff are multiplied by 16. Instead of multiplying, we can reduce the division factor q_bits by 4 */                 \
1228
0
                q_bits = QUANT_SHIFT + qp_div + transform_shift + SCALING_Q_SHIFT - quant_multiplier - FLAT_RESCALE_MAT_Q_SHIFT /* 2048 */;                                                                       \
1229
0
                                                                                                                                                                                \
1230
0
                sign = (inp)<0 ? -1:1;                                                                                                                                       \
1231
0
                                                                                                                                                                                \
1232
0
                tmp = (WORD32)(abs(inp));                                                                                                                                    \
1233
0
                tmp = tmp * (quant_coeff);                                                                                                                                   \
1234
0
                tmp = tmp + (((WORD32)q_add) << (q_bits - QUANT_ROUND_FACTOR_Q));                                                                                            \
1235
0
                tmp = tmp >> q_bits;                                                                                                                                         \
1236
0
                                                                                                                                                                                \
1237
0
                tmp = tmp * sign;                                                                                                                                            \
1238
0
                out = (WORD16) CLIP_S16(tmp);                                                                                                                                \
1239
0
            }
1240
0
            i2_temp1 = pi2_q_dst[j];
1241
0
            if (abs(pi2_q_dst[j]) > 1)
1242
0
            {
1243
0
                QUANT_NO_WEIGHTMAT(pi2_q_dst[j], i2_temp,
1244
0
                  g_ihevc_quant_scales[qp_rem], qp_div,
1245
0
                  log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
1246
0
            }
1247
1248
1249
0
            ASSERT(abs(i2_temp1-pi2_q_dst[j]) <= 1);
1250
0
            ASSERT(abs(i2_temp1) <= abs(pi2_q_dst[j]));
1251
1252
1253
            /*  Inverse Quantization    */
1254
0
            IQUANT(pi2_iq_dst[j],
1255
0
                    pi2_q_dst[j], /*pi2_src[index*src_strd]*/
1256
0
                    pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem], /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
1257
0
                    shift_iq,
1258
0
                    qp_div);
1259
1260
            /*  SSD Computation & Accumulation  */
1261
0
            val = i2_temp - pi2_iq_dst[j];
1262
0
            ssd_cost += val*val;
1263
1264
0
        }
1265
1266
0
        pi2_q_dst   += dst_q_strd;
1267
0
        pi2_iq_dst  += dst_iq_strd;
1268
0
        pi2_quant_coeff += trans_size;
1269
0
        pi2_coeffs += src_strd;
1270
0
        pi2_dequant_coeff += trans_size;
1271
1272
0
    }
1273
    /* Store the cost */
1274
0
    *pi8_cost = ssd_cost;
1275
1276
    /* CSBF update */
1277
0
    {
1278
0
        WORD32 block_row, block_col;
1279
0
        WORD32 row, col;
1280
0
        WORD16 *pi2_block;
1281
0
        UWORD32 temp_zero_col = 0;
1282
0
        UWORD32 temp_zero_row = 0;
1283
1284
0
        pi2_q_dst = pi2_q_dst_orig;
1285
1286
0
        for(block_row = 0; block_row < trans_size; block_row += 4)
1287
0
        {
1288
            //block_col is incrementing by 1 for easy update of csbf pointer
1289
0
            for(block_col = 0; block_col < trans_size / 4; block_col++)
1290
0
            {
1291
0
                pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
1292
0
                *(csbf + block_col) = 0;
1293
1294
0
                for(row = 0; row < 4; row++)
1295
0
                {
1296
0
                    for(col = 0; col < 4; col++)
1297
0
                    {
1298
0
                        if(pi2_block[row * dst_q_strd + col] != 0)
1299
0
                        {
1300
0
                            *(csbf + block_col) = 1;
1301
0
                            break;
1302
0
                        }
1303
0
                    }
1304
0
                    if(*(csbf + block_col) == 1)
1305
0
                    {
1306
                        /* zero_col update *//* temp_zero_col = ~zero_col */
1307
0
                        temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
1308
                        // zero col can be optimized further. Now clearing the
1309
                        // entire 4 bits corresponding to 4 colums of 4x4 block
1310
                        // even if any 4x4 csbf is set
1311
1312
                        /* zero row update */ /* temp_zero_row = ~zero_row */
1313
0
                        temp_zero_row = (temp_zero_row) | (0xFU << block_row);
1314
                        // zero row can be optimized further. Now clearing the
1315
                        // entire 4 bits corresponding to 4 rows of 4x4 block
1316
                        // even if any 4x4 csbf is set
1317
1318
0
                        break;
1319
0
                    }
1320
0
                }
1321
1322
0
                cbf = cbf || (*(csbf + block_col)); // cbf update
1323
0
            }
1324
0
            csbf += csbf_strd;
1325
0
        }
1326
1327
0
        *zero_col = ~temp_zero_col; //final zero_col storing
1328
0
        *zero_row = ~temp_zero_row; //final zero_row storing
1329
0
    }
1330
0
    return cbf;
1331
0
}
1332
1333
WORD32 ihevc_quant_iquant_flat_scale_mat_rdoq
1334
    (
1335
    WORD16 *pi2_coeffs,
1336
    WORD16 *pi2_quant_coeff,
1337
    WORD16 *pi2_q_dst,
1338
    WORD16 *pi2_iq_dst,
1339
    WORD32  trans_size,
1340
    WORD32 qp_div,/* qpscaled / 6 */
1341
    WORD32 qp_rem,/* qpscaled % 6 */
1342
    WORD32 q_add,
1343
    WORD32 *pi4_quant_round_factor_0_1,
1344
    WORD32 *pi4_quant_round_factor_1_2,
1345
    WORD32 src_strd,
1346
    WORD32 dst_q_strd,
1347
    WORD32 dst_iq_strd,
1348
    UWORD8 *csbf,
1349
    WORD32 csbf_strd,
1350
    WORD32 *zero_col,
1351
    WORD32 *zero_row,
1352
    WORD16 *pi2_dequant_coeff,
1353
    LWORD64 *pi8_cost
1354
    )
1355
0
{
1356
0
    WORD32 i, j;
1357
0
    WORD32 log2_size;
1358
0
    WORD16 *pi2_q_dst_orig;
1359
0
    WORD32 cbf = 0;
1360
0
    WORD32 bit_depth,shift_iq;
1361
0
    WORD16 i2_temp;
1362
1363
0
    (void)pi8_cost;
1364
0
    (void)pi4_quant_round_factor_0_1;
1365
0
    (void)pi4_quant_round_factor_1_2;
1366
0
    pi2_q_dst_orig  = pi2_q_dst;
1367
1368
    /* Quant initialization */
1369
0
    GETRANGE(log2_size, trans_size);
1370
0
    log2_size -= 1;
1371
1372
0
    bit_depth = 8 + 0;
1373
0
    shift_iq = bit_depth + log2_size - 5;
1374
1375
0
    for(i = 0; i < trans_size; i++)
1376
0
    {
1377
0
        for(j = 0; j < trans_size; j++)
1378
0
        {
1379
0
            WORD16 i2_temp1;
1380
            /*  Back up the coefficients before Quantization    */
1381
0
            i2_temp = pi2_coeffs[j];
1382
1383
0
            QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
1384
0
                g_ihevc_quant_scales[qp_rem], qp_div,
1385
0
                log2_size, q_add);
1386
1387
0
            i2_temp1 = pi2_q_dst[j];
1388
1389
0
            if (abs(pi2_q_dst[j]) > 1)
1390
0
            {
1391
0
                QUANT_NO_WEIGHTMAT(pi2_q_dst[j], i2_temp,
1392
0
                    g_ihevc_quant_scales[qp_rem], qp_div,
1393
0
                    log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
1394
0
            }
1395
1396
0
            ASSERT(abs(i2_temp1-pi2_q_dst[j]) <= 1);
1397
0
            ASSERT(abs(i2_temp1) <= abs(pi2_q_dst[j]));
1398
1399
0
            IQUANT(pi2_iq_dst[j],
1400
0
                pi2_q_dst[j], /*pi2_src[index*src_strd]*/
1401
0
                pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem], /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
1402
0
                shift_iq,
1403
0
                qp_div);
1404
0
        }
1405
1406
0
        pi2_q_dst   += dst_q_strd;
1407
0
        pi2_iq_dst  += dst_iq_strd;
1408
0
        pi2_quant_coeff += trans_size;
1409
0
        pi2_coeffs += src_strd;
1410
0
        pi2_dequant_coeff += trans_size;
1411
0
    }
1412
1413
    /* CSBF update */
1414
0
    {
1415
0
        WORD32 block_row, block_col;
1416
0
        WORD32 row, col;
1417
0
        WORD16 *pi2_block;
1418
0
        UWORD32 temp_zero_col = 0;
1419
0
        UWORD32 temp_zero_row = 0;
1420
1421
0
        pi2_q_dst = pi2_q_dst_orig;
1422
1423
0
        for(block_row = 0; block_row < trans_size; block_row += 4)
1424
0
        {
1425
            //block_col is incrementing by 1 for easy update of csbf pointer
1426
0
            for(block_col = 0; block_col < trans_size / 4; block_col++)
1427
0
            {
1428
0
                pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
1429
0
                *(csbf + block_col) = 0;
1430
1431
0
                for(row = 0; row < 4; row++)
1432
0
                {
1433
0
                    for(col = 0; col < 4; col++)
1434
0
                    {
1435
0
                        if(pi2_block[row * dst_q_strd + col] != 0)
1436
0
                        {
1437
0
                            *(csbf + block_col) = 1;
1438
0
                            break;
1439
0
                        }
1440
0
                    }
1441
0
                    if(*(csbf + block_col) == 1)
1442
0
                    {
1443
                        /* zero_col update *//* temp_zero_col = ~zero_col */
1444
0
                        temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
1445
                        // zero col can be optimized further. Now clearing the
1446
                        // entire 4 bits corresponding to 4 colums of 4x4 block
1447
                        // even if any 4x4 csbf is set
1448
1449
                        /* zero row update */ /* temp_zero_row = ~zero_row */
1450
0
                        temp_zero_row = (temp_zero_row) | (0xFU << block_row);
1451
                        // zero row can be optimized further. Now clearing the
1452
                        // entire 4 bits corresponding to 4 rows of 4x4 block
1453
                        // even if any 4x4 csbf is set
1454
1455
0
                        break;
1456
0
                    }
1457
0
                }
1458
1459
0
                cbf = cbf || (*(csbf + block_col)); // cbf update
1460
0
            }
1461
0
            csbf += csbf_strd;
1462
0
        }
1463
1464
0
        *zero_col = ~temp_zero_col; //final zero_col storing
1465
0
        *zero_row = ~temp_zero_row; //final zero_row storing
1466
0
    }
1467
1468
0
    return cbf;
1469
0
}
1470
1471
1472
/**
1473
*******************************************************************************
1474
*
1475
* @brief
1476
*  This function performs quantization, followed by Inverse
1477
*  quantization to find transform domain SSD
1478
*
1479
* @par Description:
1480
*  Performs quantization on coeffs
1481
*
1482
* @param[in] pi2_coeffs
1483
*  4x4 Coeffs
1484
*
1485
* @param[in] pi2_quant_coeff
1486
*  Scaling Matrix
1487
*
1488
* @param[out] pi2_dst
1489
*  Output 4x4 coefficients
1490
*
1491
* @param[in] qp_div
1492
*  Quantization parameter / 6
1493
*
1494
* @param[in] qp_rem
1495
*  Quantization parameter % 6
1496
*
1497
* @param[in] src_strd
1498
*  Input stride
1499
*
1500
* @param[in] dst_strd
1501
*  Output Stride
1502
*
1503
* @param[out] csbf
1504
*  coded sub block flag
1505
*
1506
* @param[in] csbf_strd
1507
*  coded sub block flag
1508
*
1509
* @param[out] zero_col
1510
*  zero column flag
1511
*
1512
* @param[out] zero_row
1513
*  zero column flag
1514
*
1515
* @returns  cbf
1516
* coded block flag
1517
*
1518
* @remarks
1519
*  None
1520
*
1521
*******************************************************************************
1522
*/
1523
1524
WORD32 ihevc_q_iq_ssd_var_rnd_fact
1525
    (
1526
    WORD16 *pi2_coeffs,
1527
    WORD16 *pi2_quant_coeff,
1528
    WORD16 *pi2_q_dst,
1529
    WORD16 *pi2_iq_dst,
1530
    WORD32  trans_size,
1531
    WORD32 qp_div,/* qpscaled / 6 */
1532
    WORD32 qp_rem,/* qpscaled % 6 */
1533
    WORD32 q_add,
1534
    WORD32 *pi4_quant_round_factor_0_1,
1535
    WORD32 *pi4_quant_round_factor_1_2,
1536
    WORD32 src_strd,
1537
    WORD32 dst_q_strd,
1538
    WORD32 dst_iq_strd,
1539
    UWORD8 *csbf,
1540
    WORD32 csbf_strd,
1541
    WORD32 *zero_col,
1542
    WORD32 *zero_row,
1543
    WORD16 *pi2_dequant_coeff,
1544
    LWORD64 *pi8_cost
1545
    )
1546
7.01M
{
1547
7.01M
    WORD32 i, j;
1548
7.01M
    WORD32 log2_size;
1549
7.01M
    WORD16 *pi2_q_dst_orig;
1550
7.01M
    WORD32 cbf = 0;
1551
7.01M
    WORD32 bit_depth,shift_iq;
1552
7.01M
    WORD32 val;
1553
7.01M
    WORD16 i2_temp;
1554
    //WORD16 i2_temp_1;
1555
    /* Initialize cost to zero */
1556
7.01M
    WORD32 ssd_cost = 0;
1557
1558
7.01M
    (void)q_add;
1559
7.01M
    pi2_q_dst_orig  = pi2_q_dst;
1560
1561
1562
    /* Quant initialization */
1563
7.01M
    GETRANGE(log2_size, trans_size);
1564
7.01M
    log2_size -= 1;
1565
1566
7.01M
    bit_depth = 8 + 0;
1567
7.01M
    shift_iq = bit_depth + log2_size - 5;
1568
1569
64.8M
    for(i = 0; i < trans_size; i++)
1570
57.7M
    {
1571
822M
        for(j = 0; j < trans_size; j++)
1572
764M
        {
1573
            /*  Back up the coefficients before Quantization    */
1574
764M
            i2_temp = pi2_coeffs[j];
1575
1576
1577
764M
            {
1578
                /*  Quantization    */
1579
764M
                QUANT(pi2_q_dst[j],i2_temp,
1580
764M
                    pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1581
764M
                    log2_size, 0);
1582
764M
                if (abs(pi2_q_dst[j]) >= 2)
1583
29.2M
                {
1584
29.2M
                    QUANT(pi2_q_dst[j],i2_temp,
1585
29.2M
                        pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1586
29.2M
                        log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
1587
1588
29.2M
                }
1589
735M
                else if (abs(pi2_q_dst[j]) >= 1)
1590
14.3M
                {
1591
14.3M
                    QUANT(pi2_q_dst[j],i2_temp,
1592
14.3M
                        pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1593
14.3M
                        log2_size, *pi4_quant_round_factor_1_2);
1594
14.3M
                }
1595
1596
720M
                else
1597
720M
                {
1598
                    /*  Quantization    */
1599
720M
                    QUANT(pi2_q_dst[j],i2_temp,
1600
720M
                        pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1601
720M
                        log2_size, *pi4_quant_round_factor_0_1);
1602
720M
                }
1603
1604
764M
            }
1605
1606
1607
1608
            /*  Inverse Quantization    */
1609
764M
            IQUANT(pi2_iq_dst[j],
1610
764M
                pi2_q_dst[j], /*pi2_src[index*src_strd]*/
1611
764M
                pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
1612
                /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
1613
764M
                shift_iq,
1614
764M
                qp_div);
1615
1616
            /*  SSD Computation & Accumulation  */
1617
764M
            val = i2_temp - pi2_iq_dst[j];
1618
764M
            ssd_cost += val*val;
1619
1620
764M
            pi4_quant_round_factor_0_1++;
1621
764M
            pi4_quant_round_factor_1_2++;
1622
764M
        }
1623
1624
57.7M
        pi2_q_dst   += dst_q_strd;
1625
57.7M
        pi2_iq_dst  += dst_iq_strd;
1626
57.7M
        pi2_quant_coeff += trans_size;
1627
57.7M
        pi2_coeffs += src_strd;
1628
57.7M
        pi2_dequant_coeff += trans_size;
1629
57.7M
    }
1630
    /* Store the cost */
1631
7.01M
    *pi8_cost = ssd_cost;
1632
1633
    /* CSBF update */
1634
7.01M
    {
1635
7.01M
        WORD32 block_row, block_col;
1636
7.01M
        WORD32 row, col;
1637
7.01M
        WORD16 *pi2_block;
1638
7.01M
        UWORD32 temp_zero_col = 0;
1639
7.01M
        UWORD32 temp_zero_row = 0;
1640
1641
7.01M
        pi2_q_dst = pi2_q_dst_orig;
1642
1643
21.4M
        for(block_row = 0; block_row < trans_size; block_row += 4)
1644
14.4M
        {
1645
            //block_col is incrementing by 1 for easy update of csbf pointer
1646
62.2M
            for(block_col = 0; block_col < trans_size / 4; block_col++)
1647
47.7M
            {
1648
47.7M
                pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
1649
47.7M
                *(csbf + block_col) = 0;
1650
1651
218M
                for(row = 0; row < 4; row++)
1652
176M
                {
1653
862M
                    for(col = 0; col < 4; col++)
1654
690M
                    {
1655
690M
                        if(pi2_block[row * dst_q_strd + col] != 0)
1656
5.09M
                        {
1657
5.09M
                            *(csbf + block_col) = 1;
1658
5.09M
                            break;
1659
5.09M
                        }
1660
690M
                    }
1661
176M
                    if(*(csbf + block_col) == 1)
1662
5.09M
                    {
1663
                        /* zero_col update *//* temp_zero_col = ~zero_col */
1664
5.09M
                        temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
1665
                        // zero col can be optimized further. Now clearing the
1666
                        // entire 4 bits corresponding to 4 colums of 4x4 block
1667
                        // even if any 4x4 csbf is set
1668
1669
                        /* zero row update */ /* temp_zero_row = ~zero_row */
1670
5.09M
                        temp_zero_row = (temp_zero_row) | (0xFU << block_row);
1671
                        // zero row can be optimized further. Now clearing the
1672
                        // entire 4 bits corresponding to 4 rows of 4x4 block
1673
                        // even if any 4x4 csbf is set
1674
1675
5.09M
                        break;
1676
5.09M
                    }
1677
176M
                }
1678
1679
47.7M
                cbf = cbf || (*(csbf + block_col)); // cbf update
1680
47.7M
            }
1681
14.4M
            csbf += csbf_strd;
1682
14.4M
        }
1683
1684
7.01M
        *zero_col = ~temp_zero_col; //final zero_col storing
1685
7.01M
        *zero_row = ~temp_zero_row; //final zero_row storing
1686
7.01M
    }
1687
1688
7.01M
    return cbf;
1689
7.01M
}
1690
1691
WORD32 ihevc_q_iq_var_rnd_fact
1692
    (
1693
    WORD16 *pi2_coeffs,
1694
    WORD16 *pi2_quant_coeff,
1695
    WORD16 *pi2_q_dst,
1696
    WORD16 *pi2_iq_dst,
1697
    WORD32  trans_size,
1698
    WORD32 qp_div,/* qpscaled / 6 */
1699
    WORD32 qp_rem,/* qpscaled % 6 */
1700
    WORD32 q_add,
1701
    WORD32 *pi4_quant_round_factor_0_1,
1702
    WORD32 *pi4_quant_round_factor_1_2,
1703
    WORD32 src_strd,
1704
    WORD32 dst_q_strd,
1705
    WORD32 dst_iq_strd,
1706
    UWORD8 *csbf,
1707
    WORD32 csbf_strd,
1708
    WORD32 *zero_col,
1709
    WORD32 *zero_row,
1710
    WORD16 *pi2_dequant_coeff,
1711
    LWORD64 *pi8_cost
1712
    )
1713
14.8M
{
1714
14.8M
    WORD32 i, j;
1715
14.8M
    WORD32 log2_size;
1716
14.8M
    WORD16 *pi2_q_dst_orig;
1717
14.8M
    WORD32 cbf = 0;
1718
14.8M
    WORD32 bit_depth,shift_iq;
1719
14.8M
    WORD16 i2_temp;
1720
1721
14.8M
    (void)q_add;
1722
14.8M
    (void)pi8_cost;
1723
14.8M
    pi2_q_dst_orig  = pi2_q_dst;
1724
1725
14.8M
    GETRANGE(log2_size, trans_size);
1726
14.8M
    log2_size -= 1;
1727
1728
14.8M
    bit_depth = 8 + 0;
1729
14.8M
    shift_iq = bit_depth + log2_size - 5;
1730
1731
147M
    for(i = 0; i < trans_size; i++)
1732
132M
    {
1733
1.94G
        for(j = 0; j < trans_size; j++)
1734
1.81G
        {
1735
1.81G
            i2_temp = pi2_coeffs[j];
1736
1737
1.81G
            {
1738
1.81G
                QUANT(pi2_q_dst[j],i2_temp,
1739
1.81G
                    pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1740
1.81G
                    log2_size, 0);
1741
1742
1.81G
                if (abs(pi2_q_dst[j]) >= 2)
1743
82.6M
                {
1744
82.6M
                    QUANT(pi2_q_dst[j],i2_temp,
1745
82.6M
                        pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1746
82.6M
                        log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
1747
82.6M
                }
1748
1.73G
                else if (abs(pi2_q_dst[j]) >= 1)
1749
18.5M
                {
1750
18.5M
                    QUANT(pi2_q_dst[j],i2_temp,
1751
18.5M
                        pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1752
18.5M
                        log2_size, *pi4_quant_round_factor_1_2);
1753
18.5M
                }
1754
1.71G
                else
1755
1.71G
                {
1756
1.71G
                    QUANT(pi2_q_dst[j],i2_temp,
1757
1.71G
                        pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1758
1.71G
                        log2_size, *pi4_quant_round_factor_0_1);
1759
1.71G
                }
1760
1.81G
            }
1761
1762
1.81G
            IQUANT(pi2_iq_dst[j],
1763
1.81G
                pi2_q_dst[j], /*pi2_src[index*src_strd]*/
1764
1.81G
                pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
1765
1.81G
                shift_iq,
1766
1.81G
                qp_div);
1767
1768
1.81G
            pi4_quant_round_factor_0_1++;
1769
1.81G
            pi4_quant_round_factor_1_2++;
1770
1.81G
        }
1771
1772
132M
        pi2_q_dst   += dst_q_strd;
1773
132M
        pi2_iq_dst  += dst_iq_strd;
1774
132M
        pi2_quant_coeff += trans_size;
1775
132M
        pi2_coeffs += src_strd;
1776
132M
        pi2_dequant_coeff += trans_size;
1777
132M
    }
1778
1779
    /* CSBF update */
1780
14.8M
    {
1781
14.8M
        WORD32 block_row, block_col;
1782
14.8M
        WORD32 row, col;
1783
14.8M
        WORD16 *pi2_block;
1784
14.8M
        UWORD32 temp_zero_col = 0;
1785
14.8M
        UWORD32 temp_zero_row = 0;
1786
1787
14.8M
        pi2_q_dst = pi2_q_dst_orig;
1788
1789
48.0M
        for(block_row = 0; block_row < trans_size; block_row += 4)
1790
33.2M
        {
1791
            //block_col is incrementing by 1 for easy update of csbf pointer
1792
146M
            for(block_col = 0; block_col < trans_size / 4; block_col++)
1793
113M
            {
1794
113M
                pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
1795
113M
                *(csbf + block_col) = 0;
1796
1797
533M
                for(row = 0; row < 4; row++)
1798
428M
                {
1799
2.11G
                    for(col = 0; col < 4; col++)
1800
1.69G
                    {
1801
1.69G
                        if(pi2_block[row * dst_q_strd + col] != 0)
1802
8.55M
                        {
1803
8.55M
                            *(csbf + block_col) = 1;
1804
8.55M
                            break;
1805
8.55M
                        }
1806
1.69G
                    }
1807
428M
                    if(*(csbf + block_col) == 1)
1808
8.55M
                    {
1809
                        /* zero_col update *//* temp_zero_col = ~zero_col */
1810
8.55M
                        temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
1811
                        // zero col can be optimized further. Now clearing the
1812
                        // entire 4 bits corresponding to 4 colums of 4x4 block
1813
                        // even if any 4x4 csbf is set
1814
1815
                        /* zero row update */ /* temp_zero_row = ~zero_row */
1816
8.55M
                        temp_zero_row = (temp_zero_row) | (0xFU << block_row);
1817
                        // zero row can be optimized further. Now clearing the
1818
                        // entire 4 bits corresponding to 4 rows of 4x4 block
1819
                        // even if any 4x4 csbf is set
1820
1821
8.55M
                        break;
1822
8.55M
                    }
1823
428M
                }
1824
1825
113M
                cbf = cbf || (*(csbf + block_col)); // cbf update
1826
113M
            }
1827
33.2M
            csbf += csbf_strd;
1828
33.2M
        }
1829
1830
14.8M
        *zero_col = ~temp_zero_col; //final zero_col storing
1831
14.8M
        *zero_row = ~temp_zero_row; //final zero_row storing
1832
14.8M
    }
1833
1834
14.8M
    return cbf;
1835
14.8M
}
1836
1837
/**
1838
*******************************************************************************
1839
*
1840
* @brief
1841
*  This function performs quantization(using flat scale matrix), followed by
1842
*  inverse quantization to find transform domain SSD; when we perform RDOQ.
1843
*  In case the quantized value turns out to be grater than 1, we then requantize
1844
*  use half rounding.
1845
*
1846
* @par Description:
1847
*  Performs quantization on coeffs
1848
*
1849
* @param[in] pi2_coeffs
1850
*  4x4 Coeffs
1851
*
1852
* @param[in] pi2_quant_coeff
1853
*  Scaling Matrix
1854
*
1855
* @param[out] pi2_dst
1856
*  Output 4x4 coefficients
1857
*
1858
* @param[in] qp_div
1859
*  Quantization parameter / 6
1860
*
1861
* @param[in] qp_rem
1862
*  Quantization parameter % 6
1863
*
1864
* @param[in] src_strd
1865
*  Input stride
1866
*
1867
* @param[in] dst_strd
1868
*  Output Stride
1869
*
1870
* @param[out] csbf
1871
*  coded sub block flag
1872
*
1873
* @param[in] csbf_strd
1874
*  coded sub block flag
1875
*
1876
* @param[out] zero_col
1877
*  zero column flag
1878
*
1879
* @param[out] zero_row
1880
*  zero column flag
1881
*
1882
* @returns  cbf
1883
* coded block flag
1884
*
1885
* @remarks
1886
*  None
1887
*
1888
*******************************************************************************
1889
*/
1890
1891
WORD32 ihevc_q_iq_ssd_flat_scale_mat_var_rnd_fact
1892
    (
1893
    WORD16 *pi2_coeffs,
1894
    WORD16 *pi2_quant_coeff,
1895
    WORD16 *pi2_q_dst,
1896
    WORD16 *pi2_iq_dst,
1897
    WORD32  trans_size,
1898
    WORD32 qp_div,/* qpscaled / 6 */
1899
    WORD32 qp_rem,/* qpscaled % 6 */
1900
    WORD32 q_add,
1901
    WORD32 *pi4_quant_round_factor_0_1,
1902
    WORD32 *pi4_quant_round_factor_1_2,
1903
    WORD32 src_strd,
1904
    WORD32 dst_q_strd,
1905
    WORD32 dst_iq_strd,
1906
    UWORD8 *csbf,
1907
    WORD32 csbf_strd,
1908
    WORD32 *zero_col,
1909
    WORD32 *zero_row,
1910
    WORD16 *pi2_dequant_coeff,
1911
    LWORD64 *pi8_cost
1912
    )
1913
10.7M
{
1914
10.7M
    WORD32 i, j;
1915
10.7M
    WORD32 log2_size;
1916
10.7M
    WORD16 *pi2_q_dst_orig;
1917
10.7M
    WORD32 cbf = 0;
1918
10.7M
    WORD32 bit_depth,shift_iq;
1919
10.7M
    WORD32 val;
1920
10.7M
    WORD16 i2_temp;
1921
    /* Initialize cost to zero */
1922
10.7M
    WORD32 ssd_cost = 0;
1923
1924
10.7M
    (void)q_add;
1925
10.7M
    pi2_q_dst_orig  = pi2_q_dst;
1926
1927
    /* Quant initialization */
1928
10.7M
    GETRANGE(log2_size, trans_size);
1929
10.7M
    log2_size -= 1;
1930
1931
10.7M
    bit_depth = 8 + 0;
1932
10.7M
    shift_iq = bit_depth + log2_size - 5;
1933
1934
96.3M
    for(i = 0; i < trans_size; i++)
1935
85.5M
    {
1936
1.17G
        for(j = 0; j < trans_size; j++)
1937
1.09G
        {
1938
1.09G
            WORD16 i2_temp1;
1939
            /*  Back up the coefficients before Quantization    */
1940
1.09G
            i2_temp = pi2_coeffs[j];
1941
1942
            /*QUANT(pi2_dst[j], pi2_coeffs[j],
1943
            pi2_quant_coeff[j] * g_ihevc_quant_scales[qp_rem], qp_div,
1944
            log2_size, q_add);*/
1945
1946
            /* modified by 1028 */
1947
            /*  Quantization    */
1948
1949
1950
1.09G
            {
1951
1.09G
                QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
1952
1.09G
                    g_ihevc_quant_scales[qp_rem], qp_div,
1953
1.09G
                    log2_size, 0);
1954
1955
1.09G
                i2_temp1 = pi2_q_dst[j];
1956
1957
1.09G
                if (abs(pi2_q_dst[j]) >= 2)
1958
48.9M
                {
1959
48.9M
                    QUANT_NO_WEIGHTMAT(pi2_q_dst[j], i2_temp,
1960
48.9M
                        g_ihevc_quant_scales[qp_rem], qp_div,
1961
48.9M
                        log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
1962
48.9M
                }
1963
1.04G
                else if (abs(pi2_q_dst[j]) >= 1)
1964
23.3M
                {
1965
23.3M
                    QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
1966
23.3M
                        g_ihevc_quant_scales[qp_rem], qp_div,
1967
23.3M
                        log2_size, *pi4_quant_round_factor_1_2);
1968
23.3M
                }
1969
1970
1.01G
                else
1971
1.01G
                {
1972
1.01G
                    QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
1973
1.01G
                        g_ihevc_quant_scales[qp_rem], qp_div,
1974
1.01G
                        log2_size, *pi4_quant_round_factor_0_1);
1975
1.01G
                }
1976
1977
1.09G
            }
1978
1979
1980
1981
1982
1.09G
            ASSERT(abs(i2_temp1-pi2_q_dst[j]) <= 1);
1983
1984
1985
            /*  Inverse Quantization    */
1986
1.09G
            IQUANT(pi2_iq_dst[j],
1987
1.09G
                pi2_q_dst[j], /*pi2_src[index*src_strd]*/
1988
1.09G
                pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem], /*pi2_dequant_coeff[index*trans_size] * g_ihevc_iquant_scales[qp_rem] */
1989
1.09G
                shift_iq,
1990
1.09G
                qp_div);
1991
1992
            /*  SSD Computation & Accumulation  */
1993
1.09G
            val = i2_temp - pi2_iq_dst[j];
1994
1.09G
            ssd_cost += val*val;
1995
1996
1.09G
            pi4_quant_round_factor_0_1++;
1997
1.09G
            pi4_quant_round_factor_1_2++;
1998
1.09G
        }
1999
2000
85.5M
        pi2_q_dst   += dst_q_strd;
2001
85.5M
        pi2_iq_dst  += dst_iq_strd;
2002
85.5M
        pi2_quant_coeff += trans_size;
2003
85.5M
        pi2_coeffs += src_strd;
2004
85.5M
        pi2_dequant_coeff += trans_size;
2005
2006
85.5M
    }
2007
    /* Store the cost */
2008
10.7M
    *pi8_cost = ssd_cost;
2009
2010
    /* CSBF update */
2011
10.7M
    {
2012
10.7M
        WORD32 block_row, block_col;
2013
10.7M
        WORD32 row, col;
2014
10.7M
        WORD16 *pi2_block;
2015
10.7M
        UWORD32 temp_zero_col = 0;
2016
10.7M
        UWORD32 temp_zero_row = 0;
2017
2018
10.7M
        pi2_q_dst = pi2_q_dst_orig;
2019
2020
32.1M
        for(block_row = 0; block_row < trans_size; block_row += 4)
2021
21.3M
        {
2022
            //block_col is incrementing by 1 for easy update of csbf pointer
2023
89.5M
            for(block_col = 0; block_col < trans_size / 4; block_col++)
2024
68.2M
            {
2025
68.2M
                pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
2026
68.2M
                *(csbf + block_col) = 0;
2027
2028
309M
                for(row = 0; row < 4; row++)
2029
249M
                {
2030
1.21G
                    for(col = 0; col < 4; col++)
2031
974M
                    {
2032
974M
                        if(pi2_block[row * dst_q_strd + col] != 0)
2033
8.09M
                        {
2034
8.09M
                            *(csbf + block_col) = 1;
2035
8.09M
                            break;
2036
8.09M
                        }
2037
974M
                    }
2038
249M
                    if(*(csbf + block_col) == 1)
2039
8.09M
                    {
2040
                        /* zero_col update *//* temp_zero_col = ~zero_col */
2041
8.09M
                        temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
2042
                        // zero col can be optimized further. Now clearing the
2043
                        // entire 4 bits corresponding to 4 colums of 4x4 block
2044
                        // even if any 4x4 csbf is set
2045
2046
                        /* zero row update */ /* temp_zero_row = ~zero_row */
2047
8.09M
                        temp_zero_row = (temp_zero_row) | (0xFU << block_row);
2048
                        // zero row can be optimized further. Now clearing the
2049
                        // entire 4 bits corresponding to 4 rows of 4x4 block
2050
                        // even if any 4x4 csbf is set
2051
2052
8.09M
                        break;
2053
8.09M
                    }
2054
249M
                }
2055
2056
68.2M
                cbf = cbf || (*(csbf + block_col)); // cbf update
2057
68.2M
            }
2058
21.3M
            csbf += csbf_strd;
2059
21.3M
        }
2060
2061
10.7M
        *zero_col = ~temp_zero_col; //final zero_col storing
2062
10.7M
        *zero_row = ~temp_zero_row; //final zero_row storing
2063
10.7M
    }
2064
10.7M
    return cbf;
2065
10.7M
}
2066
2067
WORD32 ihevc_q_iq_flat_scale_mat_var_rnd_fact
2068
    (
2069
    WORD16 *pi2_coeffs,
2070
    WORD16 *pi2_quant_coeff,
2071
    WORD16 *pi2_q_dst,
2072
    WORD16 *pi2_iq_dst,
2073
    WORD32  trans_size,
2074
    WORD32 qp_div,/* qpscaled / 6 */
2075
    WORD32 qp_rem,/* qpscaled % 6 */
2076
    WORD32 q_add,
2077
    WORD32 *pi4_quant_round_factor_0_1,
2078
    WORD32 *pi4_quant_round_factor_1_2,
2079
    WORD32 src_strd,
2080
    WORD32 dst_q_strd,
2081
    WORD32 dst_iq_strd,
2082
    UWORD8 *csbf,
2083
    WORD32 csbf_strd,
2084
    WORD32 *zero_col,
2085
    WORD32 *zero_row,
2086
    WORD16 *pi2_dequant_coeff,
2087
    LWORD64 *pi8_cost
2088
    )
2089
18.4M
{
2090
18.4M
    WORD32 i, j;
2091
18.4M
    WORD32 log2_size;
2092
18.4M
    WORD16 *pi2_q_dst_orig;
2093
18.4M
    WORD32 cbf = 0;
2094
18.4M
    WORD32 bit_depth,shift_iq;
2095
18.4M
    WORD16 i2_temp;
2096
2097
18.4M
    (void)q_add;
2098
18.4M
    (void)pi8_cost;
2099
18.4M
    pi2_q_dst_orig  = pi2_q_dst;
2100
2101
18.4M
    GETRANGE(log2_size, trans_size);
2102
18.4M
    log2_size -= 1;
2103
2104
18.4M
    bit_depth = 8 + 0;
2105
18.4M
    shift_iq = bit_depth + log2_size - 5;
2106
2107
172M
    for(i = 0; i < trans_size; i++)
2108
153M
    {
2109
2.13G
        for(j = 0; j < trans_size; j++)
2110
1.98G
        {
2111
1.98G
            WORD16 i2_temp1;
2112
2113
1.98G
            i2_temp = pi2_coeffs[j];
2114
2115
1.98G
            {
2116
1.98G
                QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
2117
1.98G
                    g_ihevc_quant_scales[qp_rem], qp_div,
2118
1.98G
                    log2_size, 0);
2119
2120
1.98G
                i2_temp1 = pi2_q_dst[j];
2121
2122
1.98G
                if (abs(pi2_q_dst[j]) >= 2)
2123
123M
                {
2124
123M
                    QUANT_NO_WEIGHTMAT(pi2_q_dst[j], i2_temp,
2125
123M
                        g_ihevc_quant_scales[qp_rem], qp_div,
2126
123M
                        log2_size, ((1 << QUANT_ROUND_FACTOR_Q)/2));
2127
123M
                }
2128
1.85G
                else if (abs(pi2_q_dst[j]) >= 1)
2129
21.3M
                {
2130
21.3M
                    QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
2131
21.3M
                        g_ihevc_quant_scales[qp_rem], qp_div,
2132
21.3M
                        log2_size, *pi4_quant_round_factor_1_2);
2133
21.3M
                }
2134
1.83G
                else
2135
1.83G
                {
2136
1.83G
                    QUANT_NO_WEIGHTMAT(pi2_q_dst[j], pi2_coeffs[j],
2137
1.83G
                        g_ihevc_quant_scales[qp_rem], qp_div,
2138
1.83G
                        log2_size, *pi4_quant_round_factor_0_1);
2139
1.83G
                }
2140
1.98G
            }
2141
2142
1.98G
            ASSERT(abs(i2_temp1-pi2_q_dst[j]) <= 1);
2143
2144
1.98G
            IQUANT(pi2_iq_dst[j],
2145
1.98G
                pi2_q_dst[j], /*pi2_src[index*src_strd]*/
2146
1.98G
                pi2_dequant_coeff[j]*g_ihevc_iquant_scales[qp_rem],
2147
1.98G
                shift_iq,
2148
1.98G
                qp_div);
2149
2150
1.98G
            pi4_quant_round_factor_0_1++;
2151
1.98G
            pi4_quant_round_factor_1_2++;
2152
1.98G
        }
2153
2154
153M
        pi2_q_dst   += dst_q_strd;
2155
153M
        pi2_iq_dst  += dst_iq_strd;
2156
153M
        pi2_quant_coeff += trans_size;
2157
153M
        pi2_coeffs += src_strd;
2158
153M
        pi2_dequant_coeff += trans_size;
2159
2160
153M
    }
2161
2162
    /* CSBF update */
2163
18.4M
    {
2164
18.4M
        WORD32 block_row, block_col;
2165
18.4M
        WORD32 row, col;
2166
18.4M
        WORD16 *pi2_block;
2167
18.4M
        UWORD32 temp_zero_col = 0;
2168
18.4M
        UWORD32 temp_zero_row = 0;
2169
2170
18.4M
        pi2_q_dst = pi2_q_dst_orig;
2171
2172
56.8M
        for(block_row = 0; block_row < trans_size; block_row += 4)
2173
38.4M
        {
2174
            //block_col is incrementing by 1 for easy update of csbf pointer
2175
162M
            for(block_col = 0; block_col < trans_size / 4; block_col++)
2176
123M
            {
2177
123M
                pi2_block = pi2_q_dst + block_row * dst_q_strd + block_col * 4;
2178
123M
                *(csbf + block_col) = 0;
2179
2180
572M
                for(row = 0; row < 4; row++)
2181
460M
                {
2182
2.25G
                    for(col = 0; col < 4; col++)
2183
1.80G
                    {
2184
1.80G
                        if(pi2_block[row * dst_q_strd + col] != 0)
2185
11.8M
                        {
2186
11.8M
                            *(csbf + block_col) = 1;
2187
11.8M
                            break;
2188
11.8M
                        }
2189
1.80G
                    }
2190
460M
                    if(*(csbf + block_col) == 1)
2191
11.8M
                    {
2192
                        /* zero_col update *//* temp_zero_col = ~zero_col */
2193
11.8M
                        temp_zero_col = (temp_zero_col) | (0xFU << block_col * 4);
2194
                        // zero col can be optimized further. Now clearing the
2195
                        // entire 4 bits corresponding to 4 colums of 4x4 block
2196
                        // even if any 4x4 csbf is set
2197
2198
                        /* zero row update */ /* temp_zero_row = ~zero_row */
2199
11.8M
                        temp_zero_row = (temp_zero_row) | (0xFU << block_row);
2200
                        // zero row can be optimized further. Now clearing the
2201
                        // entire 4 bits corresponding to 4 rows of 4x4 block
2202
                        // even if any 4x4 csbf is set
2203
2204
11.8M
                        break;
2205
11.8M
                    }
2206
460M
                }
2207
2208
123M
                cbf = cbf || (*(csbf + block_col)); // cbf update
2209
123M
            }
2210
38.4M
            csbf += csbf_strd;
2211
38.4M
        }
2212
2213
18.4M
        *zero_col = ~temp_zero_col; //final zero_col storing
2214
18.4M
        *zero_row = ~temp_zero_row; //final zero_row storing
2215
18.4M
    }
2216
18.4M
    return cbf;
2217
18.4M
}