Coverage Report

Created: 2025-08-28 06:38

/src/libhevc/encoder/ihevce_had_satd.c
Line
Count
Source (jump to first uncovered line)
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2018 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
21
/**
22
******************************************************************************
23
* @file ihevce_had_satd.c
24
*
25
* @brief
26
*    This file contains functions of Hadamard SAD and SATD
27
*
28
* @author
29
*    Ittiam
30
*
31
* List of Functions
32
*   <TODO: TO BE ADDED>
33
*
34
******************************************************************************
35
*/
36
37
/*****************************************************************************/
38
/* File Includes                                                             */
39
/*****************************************************************************/
40
/* System include files */
41
#include <stdio.h>
42
#include <string.h>
43
#include <stdlib.h>
44
#include <assert.h>
45
#include <stdarg.h>
46
#include <math.h>
47
48
/* User include files */
49
#include "ihevc_typedefs.h"
50
#include "itt_video_api.h"
51
#include "ihevce_api.h"
52
53
#include "rc_cntrl_param.h"
54
#include "rc_frame_info_collector.h"
55
#include "rc_look_ahead_params.h"
56
57
#include "ihevc_defs.h"
58
#include "ihevc_structs.h"
59
#include "ihevc_platform_macros.h"
60
#include "ihevc_deblk.h"
61
#include "ihevc_itrans_recon.h"
62
#include "ihevc_chroma_itrans_recon.h"
63
#include "ihevc_chroma_intra_pred.h"
64
#include "ihevc_intra_pred.h"
65
#include "ihevc_inter_pred.h"
66
#include "ihevc_mem_fns.h"
67
#include "ihevc_padding.h"
68
#include "ihevc_weighted_pred.h"
69
#include "ihevc_sao.h"
70
#include "ihevc_resi_trans.h"
71
#include "ihevc_quant_iquant_ssd.h"
72
#include "ihevc_cabac_tables.h"
73
74
#include "ihevce_defs.h"
75
#include "ihevce_lap_enc_structs.h"
76
#include "ihevce_multi_thrd_structs.h"
77
#include "ihevce_multi_thrd_funcs.h"
78
#include "ihevce_me_common_defs.h"
79
#include "ihevce_had_satd.h"
80
#include "ihevce_error_codes.h"
81
#include "ihevce_bitstream.h"
82
#include "ihevce_cabac.h"
83
#include "ihevce_rdoq_macros.h"
84
#include "ihevce_function_selector.h"
85
#include "ihevce_enc_structs.h"
86
#include "ihevce_cmn_utils_instr_set_router.h"
87
#include "hme_datatype.h"
88
#include "hme_interface.h"
89
#include "hme_common_defs.h"
90
#include "hme_defs.h"
91
92
/*****************************************************************************/
93
/* Function Definitions                                                      */
94
/*****************************************************************************/
95
96
static void ihevce_hadamard_4x4_8bit(
97
    UWORD8 *pu1_src,
98
    WORD32 src_strd,
99
    UWORD8 *pu1_pred,
100
    WORD32 pred_strd,
101
    WORD16 *pi2_dst,
102
    WORD32 dst_strd)
103
1.88G
{
104
1.88G
    WORD32 k;
105
1.88G
    WORD16 m[16];
106
107
    /*===== hadamard horz transform =====*/
108
9.43G
    for(k = 0; k < 4; k++)
109
7.55G
    {
110
7.55G
        WORD32 r0, r1, r2, r3;
111
7.55G
        WORD32 h0, h1, h2, h3;
112
113
        /* Compute the residue block */
114
7.55G
        r0 = pu1_src[0] - pu1_pred[0];
115
7.55G
        r1 = pu1_src[1] - pu1_pred[1];
116
7.55G
        r2 = pu1_src[2] - pu1_pred[2];
117
7.55G
        r3 = pu1_src[3] - pu1_pred[3];
118
119
7.55G
        h0 = r0 + r1;
120
7.55G
        h1 = r0 - r1;
121
7.55G
        h2 = r2 + r3;
122
7.55G
        h3 = r2 - r3;
123
124
7.55G
        m[k * 4 + 0] = h0 + h2;
125
7.55G
        m[k * 4 + 1] = h1 + h3;
126
7.55G
        m[k * 4 + 2] = h0 - h2;
127
7.55G
        m[k * 4 + 3] = h1 - h3;
128
129
7.55G
        pu1_pred += pred_strd;
130
7.55G
        pu1_src += src_strd;
131
7.55G
    }
132
133
    /*===== hadamard vert transform =====*/
134
9.43G
    for(k = 0; k < 4; k++)
135
7.55G
    {
136
7.55G
        WORD32 v0, v1, v2, v3;
137
138
7.55G
        v0 = m[0 + k] + m[4 + k];
139
7.55G
        v1 = m[0 + k] - m[4 + k];
140
7.55G
        v2 = m[8 + k] + m[12 + k];
141
7.55G
        v3 = m[8 + k] - m[12 + k];
142
143
7.55G
        pi2_dst[0 * dst_strd + k] = v0 + v2;
144
7.55G
        pi2_dst[1 * dst_strd + k] = v1 + v3;
145
7.55G
        pi2_dst[2 * dst_strd + k] = v0 - v2;
146
7.55G
        pi2_dst[3 * dst_strd + k] = v1 - v3;
147
7.55G
    }
148
1.88G
}
149
150
static void ihevce_hadamard_8x8_8bit(
151
    UWORD8 *pu1_src,
152
    WORD32 src_strd,
153
    UWORD8 *pu1_pred,
154
    WORD32 pred_strd,
155
    WORD16 *pi2_dst,
156
    WORD32 dst_strd)
157
289M
{
158
289M
    WORD32 i;
159
160
    // y0
161
289M
    ihevce_hadamard_4x4_8bit(pu1_src, src_strd, pu1_pred, pred_strd, pi2_dst, dst_strd);
162
    // y1
163
289M
    ihevce_hadamard_4x4_8bit(pu1_src + 4, src_strd, pu1_pred + 4, pred_strd, pi2_dst + 4, dst_strd);
164
    // y2
165
289M
    ihevce_hadamard_4x4_8bit(
166
289M
        pu1_src + 4 * src_strd,
167
289M
        src_strd,
168
289M
        pu1_pred + 4 * pred_strd,
169
289M
        pred_strd,
170
289M
        pi2_dst + (4 * dst_strd),
171
289M
        dst_strd);
172
    // y3
173
289M
    ihevce_hadamard_4x4_8bit(
174
289M
        pu1_src + 4 + 4 * src_strd,
175
289M
        src_strd,
176
289M
        pu1_pred + 4 + 4 * pred_strd,
177
289M
        pred_strd,
178
289M
        pi2_dst + (4 * dst_strd) + 4,
179
289M
        dst_strd);
180
181
    /*   Child HAD results combined as follows to get Parent result */
182
    /*  _                                                 _         */
183
    /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
184
    /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
185
    /* \-                                                 -/        */
186
4.92G
    for(i = 0; i < 16; i++)
187
4.63G
    {
188
4.63G
        WORD32 idx = (i >> 2) * dst_strd + (i % 4);
189
4.63G
        WORD16 a0 = pi2_dst[idx];
190
4.63G
        WORD16 a1 = pi2_dst[4 + idx];
191
4.63G
        WORD16 a2 = pi2_dst[(4 * dst_strd) + idx];
192
4.63G
        WORD16 a3 = pi2_dst[(4 * dst_strd) + 4 + idx];
193
194
4.63G
        WORD16 b0 = (a0 + a1);
195
4.63G
        WORD16 b1 = (a0 - a1);
196
4.63G
        WORD16 b2 = (a2 + a3);
197
4.63G
        WORD16 b3 = (a2 - a3);
198
199
4.63G
        pi2_dst[idx] = b0 + b2;
200
4.63G
        pi2_dst[4 + idx] = b1 + b3;
201
4.63G
        pi2_dst[(4 * dst_strd) + idx] = b0 - b2;
202
4.63G
        pi2_dst[(4 * dst_strd) + 4 + idx] = b1 - b3;
203
4.63G
    }
204
289M
}
205
206
static void ihevce_hadamard_16x16_8bit(
207
    UWORD8 *pu1_src,
208
    WORD32 src_strd,
209
    UWORD8 *pu1_pred,
210
    WORD32 pred_strd,
211
    WORD16 *pi2_dst,
212
    WORD32 dst_strd)
213
46.9M
{
214
46.9M
    WORD32 i;
215
216
    // y0
217
46.9M
    ihevce_hadamard_8x8_8bit(pu1_src, src_strd, pu1_pred, pred_strd, pi2_dst, dst_strd);
218
    // y1
219
46.9M
    ihevce_hadamard_8x8_8bit(pu1_src + 8, src_strd, pu1_pred + 8, pred_strd, pi2_dst + 8, dst_strd);
220
    // y2
221
46.9M
    ihevce_hadamard_8x8_8bit(
222
46.9M
        pu1_src + 8 * src_strd,
223
46.9M
        src_strd,
224
46.9M
        pu1_pred + 8 * pred_strd,
225
46.9M
        pred_strd,
226
46.9M
        pi2_dst + (8 * dst_strd),
227
46.9M
        dst_strd);
228
    // y3
229
46.9M
    ihevce_hadamard_8x8_8bit(
230
46.9M
        pu1_src + 8 + 8 * src_strd,
231
46.9M
        src_strd,
232
46.9M
        pu1_pred + 8 + 8 * pred_strd,
233
46.9M
        pred_strd,
234
46.9M
        pi2_dst + (8 * dst_strd) + 8,
235
46.9M
        dst_strd);
236
237
    /*   Child HAD results combined as follows to get Parent result */
238
    /*  _                                                 _         */
239
    /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
240
    /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
241
    /* \-                                                 -/        */
242
3.04G
    for(i = 0; i < 64; i++)
243
3.00G
    {
244
3.00G
        WORD32 idx = (i >> 3) * dst_strd + (i % 8);
245
3.00G
        WORD16 a0 = pi2_dst[idx];
246
3.00G
        WORD16 a1 = pi2_dst[8 + idx];
247
3.00G
        WORD16 a2 = pi2_dst[(8 * dst_strd) + idx];
248
3.00G
        WORD16 a3 = pi2_dst[(8 * dst_strd) + 8 + idx];
249
250
3.00G
        WORD16 b0 = (a0 + a1) >> 1;
251
3.00G
        WORD16 b1 = (a0 - a1) >> 1;
252
3.00G
        WORD16 b2 = (a2 + a3) >> 1;
253
3.00G
        WORD16 b3 = (a2 - a3) >> 1;
254
255
3.00G
        pi2_dst[idx] = b0 + b2;
256
3.00G
        pi2_dst[8 + idx] = b1 + b3;
257
3.00G
        pi2_dst[(8 * dst_strd) + idx] = b0 - b2;
258
3.00G
        pi2_dst[(8 * dst_strd) + 8 + idx] = b1 - b3;
259
3.00G
    }
260
46.9M
}
261
262
static void ihevce_hadamard_32x32_8bit(
263
    UWORD8 *pu1_src,
264
    WORD32 src_strd,
265
    UWORD8 *pu1_pred,
266
    WORD32 pred_strd,
267
    WORD16 *pi2_dst,
268
    WORD32 dst_strd)
269
4.42M
{
270
4.42M
    WORD32 i;
271
272
    // y0
273
4.42M
    ihevce_hadamard_16x16_8bit(pu1_src, src_strd, pu1_pred, pred_strd, pi2_dst, dst_strd);
274
    // y1
275
4.42M
    ihevce_hadamard_16x16_8bit(
276
4.42M
        pu1_src + 16, src_strd, pu1_pred + 16, pred_strd, pi2_dst + 16, dst_strd);
277
    // y2
278
4.42M
    ihevce_hadamard_16x16_8bit(
279
4.42M
        pu1_src + 16 * src_strd,
280
4.42M
        src_strd,
281
4.42M
        pu1_pred + 16 * pred_strd,
282
4.42M
        pred_strd,
283
4.42M
        pi2_dst + (16 * dst_strd),
284
4.42M
        dst_strd);
285
    // y3
286
4.42M
    ihevce_hadamard_16x16_8bit(
287
4.42M
        pu1_src + 16 + 16 * src_strd,
288
4.42M
        src_strd,
289
4.42M
        pu1_pred + 16 + 16 * pred_strd,
290
4.42M
        pred_strd,
291
4.42M
        pi2_dst + (16 * dst_strd) + 16,
292
4.42M
        dst_strd);
293
294
    /*   Child HAD results combined as follows to get Parent result */
295
    /*  _                                                 _         */
296
    /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
297
    /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
298
    /* \-                                                 -/        */
299
1.13G
    for(i = 0; i < 256; i++)
300
1.13G
    {
301
1.13G
        WORD32 idx = (i >> 4) * dst_strd + (i % 16);
302
1.13G
        WORD16 a0 = pi2_dst[idx] >> 2;
303
1.13G
        WORD16 a1 = pi2_dst[16 + idx] >> 2;
304
1.13G
        WORD16 a2 = pi2_dst[(16 * dst_strd) + idx] >> 2;
305
1.13G
        WORD16 a3 = pi2_dst[(16 * dst_strd) + 16 + idx] >> 2;
306
307
1.13G
        WORD16 b0 = (a0 + a1);
308
1.13G
        WORD16 b1 = (a0 - a1);
309
1.13G
        WORD16 b2 = (a2 + a3);
310
1.13G
        WORD16 b3 = (a2 - a3);
311
312
1.13G
        pi2_dst[idx] = b0 + b2;
313
1.13G
        pi2_dst[16 + idx] = b1 + b3;
314
1.13G
        pi2_dst[(16 * dst_strd) + idx] = b0 - b2;
315
1.13G
        pi2_dst[(16 * dst_strd) + 16 + idx] = b1 - b3;
316
1.13G
    }
317
4.42M
}
318
319
/**
320
*******************************************************************************
321
*
322
* @brief
323
*  Compute Hadamard sad for 4x4 block with 8-bit input
324
*
325
* @par Description:
326
*
327
* @param[in] pu1_origin
328
*  UWORD8 pointer to the current block
329
*
330
* @param[in] src_strd
331
*  WORD32 Source stride
332
*
333
* @param[in] pu1_pred_buf
334
*  UWORD8 pointer to the prediction block
335
*
336
* @param[in] pred_strd
337
*  WORD32 Pred stride
338
*
339
* @param[in] pi2_dst
340
*  WORD16 pointer to the transform block
341
*
342
* @param[in] dst_strd
343
*  WORD32 Destination stride
344
*
345
* @param[in] size
346
*  WORD32 transform Block size
347
*
348
* @returns hadamard SAD
349
*
350
* @remarks
351
*  Not updating the transform destination now. Only returning the SATD
352
*
353
*******************************************************************************
354
*/
355
UWORD32 ihevce_HAD_4x4_8bit(
356
    UWORD8 *pu1_origin,
357
    WORD32 src_strd,
358
    UWORD8 *pu1_pred_buf,
359
    WORD32 pred_strd,
360
    WORD16 *pi2_dst,
361
    WORD32 dst_strd)
362
293M
{
363
293M
    WORD32 k;
364
293M
    WORD16 v[16];
365
293M
    UWORD32 u4_sad = 0;
366
367
293M
    (void)pi2_dst;
368
293M
    (void)dst_strd;
369
293M
    ihevce_hadamard_4x4_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 4);
370
371
4.98G
    for(k = 0; k < 16; ++k)
372
4.69G
        u4_sad += abs(v[k]);
373
293M
    u4_sad = ((u4_sad + 2) >> 2);
374
375
293M
    return u4_sad;
376
293M
}
377
378
/**
379
*******************************************************************************
380
*
381
* @brief
382
*  Computes Hadamard Sad for 8x8 block with 8-bit input
383
*
384
* @par Description:
385
*
386
* @param[in] pu1_origin
387
*  UWORD8 pointer to the current block
388
*
389
* @param[in] src_strd
390
*  WORD32 Source stride
391
*
392
* @param[in] pu1_pred_buf
393
*  UWORD8 pointer to the prediction block
394
*
395
* @param[in] pred_strd
396
*  WORD32 Pred stride
397
*
398
* @param[in] pi2_dst
399
*  WORD16 pointer to the transform block
400
*
401
* @param[in] dst_strd
402
*  WORD32 Destination stride
403
*
404
* @param[in] size
405
*  WORD32 transform Block size
406
*
407
* @returns Hadamard SAD
408
*
409
* @remarks
410
*  Not updating the transform destination now. Only returning the SATD
411
*
412
*******************************************************************************
413
*/
414
UWORD32 ihevce_HAD_8x8_8bit(
415
    UWORD8 *pu1_origin,
416
    WORD32 src_strd,
417
    UWORD8 *pu1_pred_buf,
418
    WORD32 pred_strd,
419
    WORD16 *pi2_dst,
420
    WORD32 dst_strd)
421
102M
{
422
102M
    WORD32 k;
423
102M
    UWORD32 u4_sad = 0;
424
102M
    WORD16 v[64];
425
426
102M
    (void)pi2_dst;
427
102M
    (void)dst_strd;
428
102M
    ihevce_hadamard_8x8_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 8);
429
430
6.64G
    for(k = 0; k < 64; ++k)
431
6.53G
        u4_sad += abs(v[k]);
432
102M
    u4_sad = ((u4_sad + 4) >> 3);
433
434
102M
    return u4_sad;
435
102M
}
436
437
/**
438
*******************************************************************************
439
*
440
* @brief
441
*  Compute dc suppressed hadamard sad for 8x8 block with 8-bit input
442
*
443
* @par Description:
444
*
445
* @param[in] pu1_origin
446
*  UWORD8 pointer to the current block
447
*
448
* @param[in] src_strd
449
*  WORD32 Source stride
450
*
451
* @param[in] pu1_pred_buf
452
*  UWORD8 pointer to the prediction block
453
*
454
* @param[in] pred_strd
455
*  WORD32 Pred stride
456
*
457
* @param[in] pi2_dst
458
*  WORD16 pointer to the transform block
459
*
460
* @param[in] dst_strd
461
*  WORD32 Destination stride
462
*
463
* @param[in] size
464
*  WORD32 transform Block size
465
*
466
* @returns Hadamard SAD with DC Suppressed
467
*
468
* @remarks
469
*  Not updating the transform destination now. Only returning the SATD
470
*
471
*******************************************************************************
472
*/
473
UWORD32 ihevce_compute_ac_had_8x8_8bit(
474
    UWORD8 *pu1_origin,
475
    WORD32 src_strd,
476
    UWORD8 *pu1_pred_buf,
477
    WORD32 pred_strd,
478
    WORD16 *pi2_dst,
479
    WORD32 dst_strd)
480
0
{
481
0
    WORD32 k;
482
0
    UWORD32 u4_sad = 0;
483
0
    WORD16 v[64];
484
485
0
    (void)pi2_dst;
486
0
    (void)dst_strd;
487
0
    ihevce_hadamard_8x8_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 8);
488
489
0
    v[0] = 0;
490
0
    for(k = 0; k < 64; ++k)
491
0
        u4_sad += abs(v[k]);
492
0
    u4_sad = ((u4_sad + 4) >> 3);
493
494
0
    return u4_sad;
495
0
}
496
497
/**
498
*******************************************************************************
499
*
500
* @brief
501
*  Computes Hadamard Sad for 16x16 block with 8-bit input
502
*
503
* @par Description:
504
*
505
* @param[in] pu1_origin
506
*  UWORD8 pointer to the current block
507
*
508
* @param[in] src_strd
509
*  WORD32 Source stride
510
*
511
* @param[in] pu1_pred_buf
512
*  UWORD8 pointer to the prediction block
513
*
514
* @param[in] pred_strd
515
*  WORD32 Pred stride
516
*
517
* @param[in] pi2_dst
518
*  WORD16 pointer to the transform block
519
*
520
* @param[in] dst_strd
521
*  WORD32 Destination stride
522
*
523
* @param[in] size
524
*  WORD32 transform Block size
525
*
526
* @returns Hadamard SAD
527
*
528
* @remarks
529
*  Not updating the transform destination now. Only returning the SATD
530
*
531
*******************************************************************************
532
*/
533
UWORD32 ihevce_HAD_16x16_8bit(
534
    UWORD8 *pu1_origin,
535
    WORD32 src_strd,
536
    UWORD8 *pu1_pred_buf,
537
    WORD32 pred_strd,
538
    WORD16 *pi2_dst,
539
    WORD32 dst_strd)
540
29.2M
{
541
29.2M
    WORD32 k;
542
29.2M
    UWORD32 u4_sad = 0;
543
29.2M
    WORD16 v[256];
544
545
29.2M
    (void)pi2_dst;
546
29.2M
    (void)dst_strd;
547
29.2M
    ihevce_hadamard_16x16_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 16);
548
549
7.50G
    for(k = 0; k < 256; ++k)
550
7.47G
        u4_sad += abs(v[k]);
551
29.2M
    u4_sad = ((u4_sad + 4) >> 3);
552
553
29.2M
    return u4_sad;
554
29.2M
}
555
556
/**
557
*******************************************************************************
558
*
559
* @brief
560
*  Computes Hadamard Sad for 32x32 block with 8-bit input
561
*
562
* @par Description:
563
*
564
* @param[in] pu1_origin
565
*  UWORD8 pointer to the current block
566
*
567
* @param[in] src_strd
568
*  WORD32 Source stride
569
*
570
* @param[in] pu1_pred_buf
571
*  UWORD8 pointer to the prediction block
572
*
573
* @param[in] pred_strd
574
*  WORD32 Pred stride
575
*
576
* @param[in] pi2_dst
577
*  WORD16 pointer to the transform block
578
*
579
* @param[in] dst_strd
580
*  WORD32 Destination stride
581
*
582
* @param[in] size
583
*  WORD32 transform Block size
584
*
585
* @returns Hadamard SAD
586
*
587
* @remarks
588
*  Not updating the transform destination now. Only returning the SATD
589
*
590
*******************************************************************************
591
*/
592
UWORD32 ihevce_HAD_32x32_8bit(
593
    UWORD8 *pu1_origin,
594
    WORD32 src_strd,
595
    UWORD8 *pu1_pred_buf,
596
    WORD32 pred_strd,
597
    WORD16 *pi2_dst,
598
    WORD32 dst_strd)
599
4.42M
{
600
4.42M
    WORD32 k;
601
4.42M
    UWORD32 u4_sad = 0;
602
4.42M
    WORD16 v[32 * 32];
603
604
4.42M
    (void)pi2_dst;
605
4.42M
    (void)dst_strd;
606
4.42M
    ihevce_hadamard_32x32_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 32);
607
608
4.53G
    for(k = 0; k < 32 * 32; ++k)
609
4.53G
        u4_sad += abs(v[k]);
610
4.42M
    u4_sad = ((u4_sad + 2) >> 2);
611
612
4.42M
    return u4_sad;
613
4.42M
}
614
615
//#if COMPUTE_16x16_R == C
616
/**
617
*******************************************************************************
618
*
619
* @brief
620
*   Computes 8x8 transform using children 4x4 hadamard results
621
*
622
* @par Description:
623
*
624
* @param[in] pi2_4x4_had
625
*  WORD16 pointer to 4x4 hadamard buffer(y0, y1, y2, y3 hadmard in Zscan order)
626
*
627
* @param[in] had4_strd
628
*  stride of 4x4 hadmard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3
629
*
630
* @param[out] pi2_dst
631
*  destination buffer where 8x8 hadamard result is stored
632
*
633
* @param[in] dst_stride
634
*  stride of destination block
635
*
636
* @param[in] i4_frm_qstep
637
*  frm_qstep value based on the which the threshold value is calculated
638
*
639
* @returns
640
*  8x8 Hadamard SATD
641
* @remarks
642
*
643
*******************************************************************************
644
*/
645
static UWORD32 ihevce_compute_8x8HAD_using_4x4(
646
    WORD16 *pi2_4x4_had,
647
    WORD32 had4_strd,
648
    WORD16 *pi2_dst,
649
    WORD32 dst_strd,
650
    WORD32 i4_frm_qstep,
651
    WORD32 *pi4_cbf)
652
31.7M
{
653
    /* Qstep value is right shifted by 8 */
654
31.7M
    WORD32 threshold = (i4_frm_qstep >> 8);
655
656
    /* Initialize pointers to 4 subblocks of 4x4 HAD buffer */
657
31.7M
    WORD16 *pi2_y0 = pi2_4x4_had;
658
31.7M
    WORD16 *pi2_y1 = pi2_4x4_had + 4;
659
31.7M
    WORD16 *pi2_y2 = pi2_4x4_had + had4_strd * 4;
660
31.7M
    WORD16 *pi2_y3 = pi2_4x4_had + had4_strd * 4 + 4;
661
662
    /* Initialize pointers to store 8x8 HAD output */
663
31.7M
    WORD16 *pi2_dst0 = pi2_dst;
664
31.7M
    WORD16 *pi2_dst1 = pi2_dst + 4;
665
31.7M
    WORD16 *pi2_dst2 = pi2_dst + dst_strd * 4;
666
31.7M
    WORD16 *pi2_dst3 = pi2_dst + dst_strd * 4 + 4;
667
668
31.7M
    UWORD32 u4_satd = 0;
669
31.7M
    WORD32 i;
670
671
    /*   Child HAD results combined as follows to get Parent result */
672
    /*  _                                                 _         */
673
    /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
674
    /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
675
    /* \-                                                 -/        */
676
539M
    for(i = 0; i < 16; i++)
677
507M
    {
678
507M
        WORD32 src_idx = (i >> 2) * had4_strd + (i % 4);
679
507M
        WORD32 dst_idx = (i >> 2) * dst_strd + (i % 4);
680
681
507M
        WORD16 a0 = pi2_y0[src_idx];
682
507M
        WORD16 a1 = pi2_y1[src_idx];
683
507M
        WORD16 a2 = pi2_y2[src_idx];
684
507M
        WORD16 a3 = pi2_y3[src_idx];
685
686
507M
        WORD16 b0 = (a0 + a1);
687
507M
        WORD16 b1 = (a0 - a1);
688
507M
        WORD16 b2 = (a2 + a3);
689
507M
        WORD16 b3 = (a2 - a3);
690
691
507M
        pi2_dst0[dst_idx] = b0 + b2;
692
507M
        pi2_dst1[dst_idx] = b1 + b3;
693
507M
        pi2_dst2[dst_idx] = b0 - b2;
694
507M
        pi2_dst3[dst_idx] = b1 - b3;
695
696
507M
        if(ABS(pi2_dst0[dst_idx]) > threshold)
697
20.6M
            *pi4_cbf = 1;
698
507M
        if(ABS(pi2_dst1[dst_idx]) > threshold)
699
20.0M
            *pi4_cbf = 1;
700
507M
        if(ABS(pi2_dst2[dst_idx]) > threshold)
701
20.2M
            *pi4_cbf = 1;
702
507M
        if(ABS(pi2_dst3[dst_idx]) > threshold)
703
19.8M
            *pi4_cbf = 1;
704
705
507M
        u4_satd += ABS(pi2_dst0[dst_idx]);
706
507M
        u4_satd += ABS(pi2_dst1[dst_idx]);
707
507M
        u4_satd += ABS(pi2_dst2[dst_idx]);
708
507M
        u4_satd += ABS(pi2_dst3[dst_idx]);
709
507M
    }
710
711
    /* return the 8x8 satd */
712
31.7M
    return (u4_satd);
713
31.7M
}
714
715
/**
716
*******************************************************************************
717
*
718
* @brief
719
*    Computes Residue and Hadamard Transform for four 4x4 blocks (Z scan) of
720
*    a 8x8 block (Residue is computed for 8-bit src and prediction buffers)
721
*    Modified to incorporate the dead-zone implementation - Lokesh
722
*
723
* @par Description:
724
*
725
* @param[in] pu1_origin
726
*  UWORD8 pointer to the current block
727
*
728
* @param[in] src_strd
729
*  WORD32 Source stride
730
*
731
* @param[in] pu1_pred
732
*  UWORD8 pointer to the prediction block
733
*
734
* @param[in] pred_strd
735
*  WORD32 Pred stride
736
*
737
* @param[out] pi2_dst
738
*  WORD16 pointer to the transform block
739
*
740
* @param[in] dst_strd
741
*  WORD32 Destination stride
742
*
743
* @param[out] pi4_hsad
744
*  array for storing hadmard sad of each 4x4 block
745
*
746
* @param[in] hsad_stride
747
*  stride of hadmard sad destination buffer (for Zscan order of storing sads)
748
*
749
* @param[in] i4_frm_qstep
750
*  frm_qstep value based on the which the threshold value is calculated
751
*
752
* @returns
753
*
754
* @remarks
755
*
756
*******************************************************************************
757
*/
758
static WORD32 ihevce_had4_4x4(
759
    UWORD8 *pu1_src,
760
    WORD32 src_strd,
761
    UWORD8 *pu1_pred,
762
    WORD32 pred_strd,
763
    WORD16 *pi2_dst4x4,
764
    WORD32 dst_strd,
765
    WORD32 *pi4_hsad,
766
    WORD32 hsad_stride,
767
    WORD32 i4_frm_qstep)
768
108M
{
769
108M
    WORD32 i, k;
770
108M
    WORD32 i4_child_total_sad = 0;
771
772
108M
    (void)i4_frm_qstep;
773
    /* -------- Compute four 4x4 HAD Transforms ---------*/
774
543M
    for(i = 0; i < 4; i++)
775
434M
    {
776
434M
        UWORD8 *pu1_pi0, *pu1_pi1;
777
434M
        WORD16 *pi2_dst;
778
434M
        WORD32 blkx, blky;
779
434M
        UWORD32 u4_hsad = 0;
780
        // TODO: choose deadzone as f(qstep)
781
434M
        WORD32 threshold = 0;
782
783
        /*****************************************************/
784
        /*    Assuming the looping structure of the four     */
785
        /*    blocks is in Z scan order of 4x4s in a 8x8     */
786
        /*    block instead of raster scan                   */
787
        /*****************************************************/
788
434M
        blkx = (i & 0x1);
789
434M
        blky = (i >> 1);
790
791
434M
        pu1_pi0 = pu1_src + (blkx * 4) + (blky * 4 * src_strd);
792
434M
        pu1_pi1 = pu1_pred + (blkx * 4) + (blky * 4 * pred_strd);
793
434M
        pi2_dst = pi2_dst4x4 + (blkx * 4) + (blky * 4 * dst_strd);
794
795
434M
        ihevce_hadamard_4x4_8bit(pu1_pi0, src_strd, pu1_pi1, pred_strd, pi2_dst, dst_strd);
796
797
2.17G
        for(k = 0; k < 4; k++)
798
1.73G
        {
799
1.73G
            if(ABS(pi2_dst[0 * dst_strd + k]) < threshold)
800
0
                pi2_dst[0 * dst_strd + k] = 0;
801
802
1.73G
            if(ABS(pi2_dst[1 * dst_strd + k]) < threshold)
803
0
                pi2_dst[1 * dst_strd + k] = 0;
804
805
1.73G
            if(ABS(pi2_dst[2 * dst_strd + k]) < threshold)
806
0
                pi2_dst[2 * dst_strd + k] = 0;
807
808
1.73G
            if(ABS(pi2_dst[3 * dst_strd + k]) < threshold)
809
0
                pi2_dst[3 * dst_strd + k] = 0;
810
811
            /* Accumulate the SATD */
812
1.73G
            u4_hsad += ABS(pi2_dst[0 * dst_strd + k]);
813
1.73G
            u4_hsad += ABS(pi2_dst[1 * dst_strd + k]);
814
1.73G
            u4_hsad += ABS(pi2_dst[2 * dst_strd + k]);
815
1.73G
            u4_hsad += ABS(pi2_dst[3 * dst_strd + k]);
816
1.73G
        }
817
818
        /*===== Normalize the HSAD =====*/
819
434M
        pi4_hsad[blkx + (blky * hsad_stride)] = ((u4_hsad + 2) >> 2);
820
434M
        i4_child_total_sad += ((u4_hsad + 2) >> 2);
821
434M
    }
822
108M
    return i4_child_total_sad;
823
108M
}
824
825
/**
826
*******************************************************************************
827
*
828
* @brief
829
*    HSAD is returned for the 4, 4x4 in 8x8
830
*
831
* @par Description:
832
*
833
* @param[in] pu1_origin
834
*  UWORD8 pointer to the current block
835
*
836
* @param[in] src_strd
837
*  WORD32 Source stride
838
*
839
* @param[in] pu1_pred
840
*  UWORD8 pointer to the prediction block
841
*
842
* @param[in] pred_strd
843
*  WORD32 Pred stride
844
*
845
* @param[out] pi2_dst
846
*  WORD16 pointer to the transform output block
847
*
848
* @param[out] dst_strd
849
*  WORD32 Destination stride
850
*
851
* @param[out] ppi4_hsad
852
*   pointer to base pointers for storing hadmard sads of various
853
*   block sizes (4x4 to 32x32)
854
*
855
* @param[in] pos_x_y_4x4
856
*   Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
857
*   Lower 16bits denote xpos and upper 16ypos of the 4x4block
858
*
859
* @param[in] num_4x4_in_row
860
*   Denotes the number of current 4x4 blocks in a ctb/CU/MB
861
*
862
* @returns
863
*
864
* @remarks
865
*
866
*******************************************************************************
867
*/
868
void ihevce_had_8x8_using_4_4x4(
869
    UWORD8 *pu1_src,
870
    WORD32 src_strd,
871
    UWORD8 *pu1_pred,
872
    WORD32 pred_strd,
873
    WORD16 *pi2_dst,
874
    WORD32 dst_strd,
875
    WORD32 **ppi4_hsad,
876
    WORD32 pos_x_y_4x4,
877
    WORD32 num_4x4_in_row)
878
76.9M
{
879
76.9M
    WORD16 ai2_4x4_had[64];
880
76.9M
    WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
881
76.9M
    WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
882
76.9M
    WORD32 *pi4_4x4_hsad;
883
76.9M
    WORD32 *pi4_8x8_hsad;
884
885
76.9M
    (void)pi2_dst;
886
76.9M
    (void)dst_strd;
887
76.9M
    ASSERT(pos_x >= 0);
888
76.9M
    ASSERT(pos_y >= 0);
889
890
    /* Initialize pointers to  store 4x4 and 8x8 HAD SATDs */
891
76.9M
    pi4_4x4_hsad = ppi4_hsad[HAD_4x4] + pos_x + pos_y * num_4x4_in_row;
892
76.9M
    pi4_8x8_hsad = ppi4_hsad[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
893
894
    /* -------- Compute four 4x4 HAD Transforms of 8x8 in one call--------- */
895
76.9M
    pi4_8x8_hsad[0] = ihevce_had4_4x4(
896
76.9M
        pu1_src, src_strd, pu1_pred, pred_strd, ai2_4x4_had, 8, pi4_4x4_hsad, num_4x4_in_row, 0);
897
76.9M
}
898
899
/**
900
*******************************************************************************
901
*
902
* @brief
903
*    Reursive Hadamard Transform for 8x8 block. HSAD is returned for the 8x8
904
*    block and its four subblocks(4x4).
905
*
906
* @par Description:
907
*
908
* @param[in] pu1_origin
909
*  UWORD8 pointer to the current block
910
*
911
* @param[in] src_strd
912
*  WORD32 Source stride
913
*
914
* @param[in] pu1_pred
915
*  UWORD8 pointer to the prediction block
916
*
917
* @param[in] pred_strd
918
*  WORD32 Pred stride
919
*
920
* @param[out] pi2_dst
921
*  WORD16 pointer to the transform output block
922
*
923
* @param[out] dst_strd
924
*  WORD32 Destination stride
925
*
926
* @param[out] ppi4_hsad
927
*   pointer to base pointers for storing hadmard sads of various
928
*   block sizes (4x4 to 32x32)
929
*
930
* @param[in] pos_x_y_4x4
931
*   Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
932
*   Lower 16bits denote xpos and upper 16ypos of the 4x4block
933
*
934
* @param[in] num_4x4_in_row
935
*   Denotes the number of current 4x4 blocks in a ctb/CU/MB
936
*
937
* @param[in] i4_frm_qstep
938
*  frm_qstep value based on the which the threshold value is calculated
939
*
940
* @returns
941
*
942
* @remarks
943
*
944
*******************************************************************************
945
*/
946
WORD32 ihevce_had_8x8_using_4_4x4_r(
947
    UWORD8 *pu1_src,
948
    WORD32 src_strd,
949
    UWORD8 *pu1_pred,
950
    WORD32 pred_strd,
951
    WORD16 *pi2_dst,
952
    WORD32 dst_strd,
953
    WORD32 **ppi4_hsad,
954
    WORD32 **ppi4_tu_split,
955
    WORD32 **ppi4_tu_early_cbf,
956
    WORD32 pos_x_y_4x4,
957
    WORD32 num_4x4_in_row,
958
    WORD32 lambda,
959
    WORD32 lambda_q_shift,
960
    WORD32 i4_frm_qstep,
961
    WORD32 i4_cur_depth,
962
    WORD32 i4_max_depth,
963
    WORD32 i4_max_tr_size,
964
    WORD32 *pi4_tu_split_cost,
965
    void *pv_func_sel)
966
31.7M
{
967
31.7M
    WORD16 ai2_4x4_had[64];
968
31.7M
    WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
969
31.7M
    WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
970
31.7M
    WORD32 *pi4_4x4_hsad;
971
31.7M
    WORD32 *pi4_8x8_hsad;
972
31.7M
    WORD32 *pi4_8x8_tu_split;
973
974
31.7M
    WORD32 *pi4_8x8_tu_early_cbf;
975
976
31.7M
    UWORD32 u4_satd;
977
31.7M
    WORD32 cost_child = 0, cost_parent = 0;
978
31.7M
    WORD32 early_cbf = 0;
979
980
31.7M
    const UWORD8 u1_cur_tr_size = 8;
981
    /* Stores the best cost for the Current 8x8: Lokesh */
982
31.7M
    WORD32 best_cost = 0;
983
984
31.7M
    (void)pv_func_sel;
985
31.7M
    ASSERT(pos_x >= 0);
986
31.7M
    ASSERT(pos_y >= 0);
987
988
    /* Initialize pointers to  store 4x4 and 8x8 HAD SATDs */
989
31.7M
    pi4_4x4_hsad = ppi4_hsad[HAD_4x4] + pos_x + pos_y * num_4x4_in_row;
990
31.7M
    pi4_8x8_hsad = ppi4_hsad[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
991
31.7M
    pi4_8x8_tu_split = ppi4_tu_split[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
992
31.7M
    pi4_8x8_tu_early_cbf =
993
31.7M
        ppi4_tu_early_cbf[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
994
995
    /* -------- Compute four 4x4 HAD Transforms of 8x8 in one call--------- */
996
31.7M
    cost_child = ihevce_had4_4x4(
997
31.7M
        pu1_src, src_strd, pu1_pred, pred_strd, ai2_4x4_had, 8, pi4_4x4_hsad, num_4x4_in_row, 0);
998
999
    /* -------- Compute 8x8 HAD Transform using 4x4 results ------------- */
1000
31.7M
    u4_satd = ihevce_compute_8x8HAD_using_4x4(
1001
31.7M
        ai2_4x4_had, 8, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);
1002
1003
    /* store the normalized 8x8 satd */
1004
31.7M
    cost_parent = ((u4_satd + 4) >> 3);
1005
1006
    /* 4 CBF Flags, extra 1 becoz of the 0.5 bits per bin is assumed */
1007
31.7M
    cost_child += ((4) * lambda) >> (lambda_q_shift + 1);
1008
1009
31.7M
    if(i4_cur_depth < i4_max_depth)
1010
16.4M
    {
1011
16.4M
        if((cost_child < cost_parent) || (i4_max_tr_size < u1_cur_tr_size))
1012
757k
        {
1013
            //cost_child -= ((4) * lambda) >> (lambda_q_shift + 1);
1014
757k
            *pi4_tu_split_cost += (4 * lambda) >> (lambda_q_shift + 1);
1015
757k
            best_cost = cost_child;
1016
757k
            best_cost <<= 1;
1017
757k
            best_cost++;
1018
757k
            pi4_8x8_tu_split[0] = 1;
1019
757k
            pi4_8x8_hsad[0] = cost_child;
1020
757k
        }
1021
15.7M
        else
1022
15.7M
        {
1023
            //cost_parent -= ((1) * lambda) >>  (lambda_q_shift + 1);
1024
15.7M
            best_cost = cost_parent;
1025
15.7M
            best_cost <<= 1;
1026
15.7M
            pi4_8x8_tu_split[0] = 0;
1027
15.7M
            pi4_8x8_hsad[0] = cost_parent;
1028
15.7M
        }
1029
16.4M
    }
1030
15.2M
    else
1031
15.2M
    {
1032
        //cost_parent -= ((1) * lambda) >>  (lambda_q_shift + 1);
1033
15.2M
        best_cost = cost_parent;
1034
15.2M
        best_cost <<= 1;
1035
15.2M
        pi4_8x8_tu_split[0] = 0;
1036
15.2M
        pi4_8x8_hsad[0] = cost_parent;
1037
15.2M
    }
1038
1039
31.7M
    pi4_8x8_tu_early_cbf[0] = early_cbf;
1040
1041
    /* best cost has tu_split_flag at LSB(Least significant bit) */
1042
31.7M
    return ((best_cost << 1) + early_cbf);
1043
31.7M
}
1044
1045
/**
1046
*******************************************************************************
1047
*
1048
* @brief
1049
*   Computes 16x16 transform using children 8x8 hadamard results
1050
*    Modified to incorporate the dead-zone implementation - Lokesh
1051
*
1052
* @par Description:
1053
*
1054
* @param[in] pi2_8x8_had
1055
*  WORD16 pointer to 8x8 hadamard buffer(y0, y1, y2, y3 hadamard in Zscan order)
1056
*
1057
* @param[in] had8_strd
1058
*  stride of 8x8 hadamard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3
1059
*
1060
* @param[out] pi2_dst
1061
*  destination buffer where 16x16 hadamard result is stored
1062
*
1063
* @param[in] dst_stride
1064
*  stride of destination block
1065
*
1066
* @param[in] i4_frm_qstep
1067
*  frm_qstep value based on the which the threshold value is calculated
1068
*
1069
* @returns
1070
*  16x16 Hadamard SATD
1071
* @remarks
1072
*
1073
*******************************************************************************
1074
*/
1075
static UWORD32 ihevce_compute_16x16HAD_using_8x8(
1076
    WORD16 *pi2_8x8_had,
1077
    WORD32 had8_strd,
1078
    WORD16 *pi2_dst,
1079
    WORD32 dst_strd,
1080
    WORD32 i4_frm_qstep,
1081
    WORD32 *pi4_cbf)
1082
7.19M
{
1083
    /* Qstep value is right shifted by 8 */
1084
7.19M
    WORD32 threshold = (i4_frm_qstep >> 8);
1085
1086
    /* Initialize pointers to 4 subblocks of 8x8 HAD buffer */
1087
7.19M
    WORD16 *pi2_y0 = pi2_8x8_had;
1088
7.19M
    WORD16 *pi2_y1 = pi2_8x8_had + 8;
1089
7.19M
    WORD16 *pi2_y2 = pi2_8x8_had + had8_strd * 8;
1090
7.19M
    WORD16 *pi2_y3 = pi2_8x8_had + had8_strd * 8 + 8;
1091
1092
    /* Initialize pointers to store 8x8 HAD output */
1093
7.19M
    WORD16 *pi2_dst0 = pi2_dst;
1094
7.19M
    WORD16 *pi2_dst1 = pi2_dst + 8;
1095
7.19M
    WORD16 *pi2_dst2 = pi2_dst + dst_strd * 8;
1096
7.19M
    WORD16 *pi2_dst3 = pi2_dst + dst_strd * 8 + 8;
1097
1098
7.19M
    UWORD32 u4_satd = 0;
1099
7.19M
    WORD32 i;
1100
1101
    /*   Child HAD results combined as follows to get Parent result */
1102
    /*  _                                                 _         */
1103
    /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
1104
    /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
1105
    /* \-                                                 -/        */
1106
467M
    for(i = 0; i < 64; i++)
1107
460M
    {
1108
460M
        WORD32 src_idx = (i >> 3) * had8_strd + (i % 8);
1109
460M
        WORD32 dst_idx = (i >> 3) * dst_strd + (i % 8);
1110
1111
460M
        WORD16 a0 = pi2_y0[src_idx];
1112
460M
        WORD16 a1 = pi2_y1[src_idx];
1113
460M
        WORD16 a2 = pi2_y2[src_idx];
1114
460M
        WORD16 a3 = pi2_y3[src_idx];
1115
1116
460M
        WORD16 b0 = (a0 + a1) >> 1;
1117
460M
        WORD16 b1 = (a0 - a1) >> 1;
1118
460M
        WORD16 b2 = (a2 + a3) >> 1;
1119
460M
        WORD16 b3 = (a2 - a3) >> 1;
1120
1121
460M
        pi2_dst0[dst_idx] = b0 + b2;
1122
460M
        pi2_dst1[dst_idx] = b1 + b3;
1123
460M
        pi2_dst2[dst_idx] = b0 - b2;
1124
460M
        pi2_dst3[dst_idx] = b1 - b3;
1125
1126
        /* Make the value of dst to zerp, if it falls below the dead-zone */
1127
460M
        if(ABS(pi2_dst0[dst_idx]) > threshold)
1128
31.9M
            *pi4_cbf = 1;
1129
460M
        if(ABS(pi2_dst1[dst_idx]) > threshold)
1130
31.7M
            *pi4_cbf = 1;
1131
460M
        if(ABS(pi2_dst2[dst_idx]) > threshold)
1132
31.6M
            *pi4_cbf = 1;
1133
460M
        if(ABS(pi2_dst3[dst_idx]) > threshold)
1134
31.4M
            *pi4_cbf = 1;
1135
1136
460M
        u4_satd += ABS(pi2_dst0[dst_idx]);
1137
460M
        u4_satd += ABS(pi2_dst1[dst_idx]);
1138
460M
        u4_satd += ABS(pi2_dst2[dst_idx]);
1139
460M
        u4_satd += ABS(pi2_dst3[dst_idx]);
1140
460M
    }
1141
1142
    /* return 16x16 satd */
1143
7.19M
    return (u4_satd);
1144
7.19M
}
1145
1146
/**
1147
*******************************************************************************
1148
*
1149
* @brief
1150
*    Hadamard Transform for 16x16 block with 8x8 and 4x4 SATD updates.
1151
*    Uses recursive 8x8 had output to compute satd for 16x16 and its children
1152
*
1153
* @par Description:
1154
*
1155
* @param[in] pu1_origin
1156
*  UWORD8 pointer to the current block
1157
*
1158
* @param[in] src_strd
1159
*  WORD32 Source stride
1160
*
1161
* @param[in] pu1_pred
1162
*  UWORD8 pointer to the prediction block
1163
*
1164
* @param[in] pred_strd
1165
*  WORD32 Pred stride
1166
*
1167
* @param[out] pi2_dst
1168
*  WORD16 pointer to the transform output block
1169
*
1170
* @param[out] dst_strd
1171
*  WORD32 Destination stride
1172
*
1173
* @param[out] ppi4_hsad
1174
*   pointer to base pointers for storing hadmard sads of various
1175
*   block sizes (4x4 to 32x32)
1176
*
1177
* @param[in] pos_x_y_4x4
1178
*   Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
1179
*   Lower 16bits denote xpos and upper 16ypos of the 4x4block
1180
*
1181
* @param[in] num_4x4_in_row
1182
*   Denotes the number of current 4x4 blocks in a ctb/CU/MB
1183
*
1184
* @param[in] lambda
1185
*  lambda values is the cost factor calculated based on QP
1186
*
1187
* @param[in] lambda_q_shift
1188
*  lambda_q_shift used to reverse the lambda value back from q8 format
1189
*
1190
* @param[in] depth
1191
*  depth gives the current TU depth with respect to the CU
1192
*
1193
* @param[in] i4_frm_qstep
1194
*  frm_qstep value based on the which the threshold value is calculated
1195
*
1196
* @returns
1197
*
1198
* @remarks
1199
*
1200
*******************************************************************************
1201
*/
1202
1203
WORD32 ihevce_had_16x16_r(
1204
    UWORD8 *pu1_src,
1205
    WORD32 src_strd,
1206
    UWORD8 *pu1_pred,
1207
    WORD32 pred_strd,
1208
    WORD16 *pi2_dst,
1209
    WORD32 dst_strd,
1210
    WORD32 **ppi4_hsad,
1211
    WORD32 **ppi4_tu_split,
1212
    WORD32 **ppi4_tu_early_cbf,
1213
    WORD32 pos_x_y_4x4,
1214
    WORD32 num_4x4_in_row,
1215
    WORD32 lambda,
1216
    WORD32 lambda_q_shift,
1217
    WORD32 i4_frm_qstep,
1218
    WORD32 i4_cur_depth,
1219
    WORD32 i4_max_depth,
1220
    WORD32 i4_max_tr_size,
1221
    WORD32 *pi4_tu_split_cost,
1222
    void *pv_func_sel)
1223
7.19M
{
1224
7.19M
    WORD16 ai2_8x8_had[256];
1225
7.19M
    WORD32 *pi4_16x16_hsad;
1226
7.19M
    WORD32 *pi4_16x16_tu_split;
1227
1228
7.19M
    WORD32 *pi4_16x16_tu_early_cbf;
1229
1230
7.19M
    UWORD32 u4_satd = 0;
1231
7.19M
    WORD32 tu_split_flag = 0;
1232
7.19M
    WORD32 i4_early_cbf_flag = 0, early_cbf = 0;
1233
7.19M
    const UWORD8 u1_cur_tr_size = 16;
1234
1235
    /* cost_parent : Stores the cost of the parent HAD transform (16x16) */
1236
    /* cost_child : Stores the cost of the child HAD transform (16x16) */
1237
7.19M
    WORD32 cost_parent = 0, cost_child = 0;
1238
1239
    /*best_cost returns the best cost at the end of the function */
1240
    /*tu_split denoes whether the TU (16x16)is split or not */
1241
7.19M
    WORD32 best_cost = 0, best_cost_tu_split;
1242
7.19M
    WORD32 i;
1243
1244
7.19M
    WORD16 *pi2_y0;
1245
7.19M
    UWORD8 *pu1_src0;
1246
7.19M
    UWORD8 *pu1_pred0;
1247
7.19M
    WORD32 pos_x_y_4x4_0;
1248
1249
7.19M
    WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
1250
7.19M
    WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
1251
1252
7.19M
    ASSERT(pos_x >= 0);
1253
7.19M
    ASSERT(pos_y >= 0);
1254
1255
    /* Initialize pointers to  store 16x16 SATDs */
1256
7.19M
    pi4_16x16_hsad = ppi4_hsad[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
1257
1258
7.19M
    pi4_16x16_tu_split =
1259
7.19M
        ppi4_tu_split[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
1260
1261
7.19M
    pi4_16x16_tu_early_cbf =
1262
7.19M
        ppi4_tu_early_cbf[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
1263
1264
    /* -------- Compute four 8x8 HAD Transforms of 16x16 call--------- */
1265
35.9M
    for(i = 0; i < 4; i++)
1266
28.7M
    {
1267
28.7M
        pu1_src0 = pu1_src + (i & 0x01) * 8 + (i >> 1) * src_strd * 8;
1268
28.7M
        pu1_pred0 = pu1_pred + (i & 0x01) * 8 + (i >> 1) * pred_strd * 8;
1269
28.7M
        pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8;
1270
28.7M
        pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16);
1271
1272
28.7M
        best_cost_tu_split = ihevce_had_8x8_using_4_4x4_r(
1273
28.7M
            pu1_src0,
1274
28.7M
            src_strd,
1275
28.7M
            pu1_pred0,
1276
28.7M
            pred_strd,
1277
28.7M
            pi2_y0,
1278
28.7M
            16,
1279
28.7M
            ppi4_hsad,
1280
28.7M
            ppi4_tu_split,
1281
28.7M
            ppi4_tu_early_cbf,
1282
28.7M
            pos_x_y_4x4_0,
1283
28.7M
            num_4x4_in_row,
1284
28.7M
            lambda,
1285
28.7M
            lambda_q_shift,
1286
28.7M
            i4_frm_qstep,
1287
28.7M
            i4_cur_depth + 1,
1288
28.7M
            i4_max_depth,
1289
28.7M
            i4_max_tr_size,
1290
28.7M
            pi4_tu_split_cost,
1291
28.7M
            pv_func_sel);
1292
1293
        /* Cost is shifted by two bits for Tu_split_flag and early cbf flag */
1294
28.7M
        best_cost = (best_cost_tu_split >> 2);
1295
1296
        /* Last but one bit stores the information regarding the TU_Split */
1297
28.7M
        tu_split_flag += (best_cost_tu_split & 0x3) >> 1;
1298
1299
        /* Last bit stores the information regarding the early_cbf */
1300
28.7M
        i4_early_cbf_flag += (best_cost_tu_split & 0x1);
1301
1302
28.7M
        cost_child += best_cost;
1303
1304
28.7M
        tu_split_flag <<= 1;
1305
28.7M
        i4_early_cbf_flag <<= 1;
1306
28.7M
    }
1307
1308
    /* -------- Compute 16x16 HAD Transform using 8x8 results ------------- */
1309
7.19M
    pi2_y0 = ai2_8x8_had;
1310
1311
    /* Threshold currently passed as "0" */
1312
7.19M
    u4_satd =
1313
7.19M
        ihevce_compute_16x16HAD_using_8x8(pi2_y0, 16, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);
1314
1315
    /* store the normalized satd */
1316
7.19M
    cost_parent = ((u4_satd + 4) >> 3);
1317
1318
    /* 4 TU_Split flags , 4 CBF Flags, extra 1 becoz of the 0.5 bits per bin is assumed */
1319
7.19M
    cost_child += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1320
1321
7.19M
    i4_early_cbf_flag += early_cbf;
1322
1323
    /* Right now the depth is hard-coded to 4: The depth can be modified from the config file
1324
    which decides the extent to which TU_REC needs to be done */
1325
7.19M
    if(i4_cur_depth < i4_max_depth)
1326
5.25M
    {
1327
5.25M
        if((cost_child < cost_parent) || (i4_max_tr_size < u1_cur_tr_size))
1328
524k
        {
1329
            //cost_child -= ((4 + 4)  * lambda) >> (lambda_q_shift + 1);
1330
524k
            *pi4_tu_split_cost += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1331
524k
            tu_split_flag += 1;
1332
524k
            best_cost = cost_child;
1333
524k
        }
1334
4.72M
        else
1335
4.72M
        {
1336
            //cost_parent -= ((1 + 1) * lambda) >>  (lambda_q_shift + 1);
1337
4.72M
            tu_split_flag += 0;
1338
4.72M
            best_cost = cost_parent;
1339
4.72M
        }
1340
5.25M
    }
1341
1.94M
    else
1342
1.94M
    {
1343
        //cost_parent -= ((1 + 1) * lambda) >>  (lambda_q_shift + 1);
1344
1.94M
        tu_split_flag += 0;
1345
1.94M
        best_cost = cost_parent;
1346
1.94M
    }
1347
1348
7.19M
    pi4_16x16_hsad[0] = best_cost;
1349
7.19M
    pi4_16x16_tu_split[0] = tu_split_flag;
1350
7.19M
    pi4_16x16_tu_early_cbf[0] = i4_early_cbf_flag;
1351
1352
    /*returning two values(best cost & tu_split_flag) as a single value*/
1353
7.19M
    return ((best_cost << 10) + (tu_split_flag << 5) + i4_early_cbf_flag);
1354
7.19M
}
1355
1356
//#endif
1357
/**
1358
*******************************************************************************
1359
*
1360
* @brief
1361
*   Computes 32x32 transform using children 16x16 hadamard results
1362
*
1363
* @par Description:
1364
*
1365
* @param[in] pi2_16x16_had
1366
*  WORD16 pointer to 16x16 hadamard buffer(y0, y1, y2, y3 hadmard in Zscan order)
1367
*
1368
* @param[in] had16_strd
1369
*  stride of 16x16 hadmard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3
1370
*
1371
* @param[out] pi2_dst
1372
*  destination buffer where 16x16 hadamard result is stored
1373
*
1374
* @param[in] dst_stride
1375
*  stride of destination block
1376
*
1377
* @param[in] i4_frm_qstep
1378
*  frm_qstep value based on the which the threshold value is calculated
1379
*
1380
* @returns
1381
*  32x32 Hadamard SATD
1382
* @remarks
1383
*
1384
*******************************************************************************
1385
*/
1386
//#if COMPUTE_32x32_USING_16X16 == C
1387
UWORD32 ihevce_compute_32x32HAD_using_16x16(
1388
    WORD16 *pi2_16x16_had,
1389
    WORD32 had16_strd,
1390
    WORD16 *pi2_dst,
1391
    WORD32 dst_strd,
1392
    WORD32 i4_frm_qstep,
1393
    WORD32 *pi4_cbf)
1394
1.12M
{
1395
    /* Qstep value is right shifted by 8 */
1396
1.12M
    WORD32 threshold = (i4_frm_qstep >> 8);
1397
1398
    /* Initialize pointers to 4 subblocks of 8x8 HAD buffer */
1399
1.12M
    WORD16 *pi2_y0 = pi2_16x16_had;
1400
1.12M
    WORD16 *pi2_y1 = pi2_16x16_had + 16;
1401
1.12M
    WORD16 *pi2_y2 = pi2_16x16_had + had16_strd * 16;
1402
1.12M
    WORD16 *pi2_y3 = pi2_16x16_had + had16_strd * 16 + 16;
1403
1404
    /* Initialize pointers to store 8x8 HAD output */
1405
1.12M
    WORD16 *pi2_dst0 = pi2_dst;
1406
1.12M
    WORD16 *pi2_dst1 = pi2_dst + 16;
1407
1.12M
    WORD16 *pi2_dst2 = pi2_dst + dst_strd * 16;
1408
1.12M
    WORD16 *pi2_dst3 = pi2_dst + dst_strd * 16 + 16;
1409
1410
1.12M
    UWORD32 u4_satd = 0;
1411
1.12M
    WORD32 i;
1412
1413
    /*   Child HAD results combined as follows to get Parent result */
1414
    /*  _                                                 _         */
1415
    /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
1416
    /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
1417
    /* \-                                                 -/        */
1418
289M
    for(i = 0; i < 256; i++)
1419
288M
    {
1420
288M
        WORD32 src_idx = (i >> 4) * had16_strd + (i % 16);
1421
288M
        WORD32 dst_idx = (i >> 4) * dst_strd + (i % 16);
1422
1423
288M
        WORD16 a0 = pi2_y0[src_idx] >> 2;
1424
288M
        WORD16 a1 = pi2_y1[src_idx] >> 2;
1425
288M
        WORD16 a2 = pi2_y2[src_idx] >> 2;
1426
288M
        WORD16 a3 = pi2_y3[src_idx] >> 2;
1427
1428
288M
        WORD16 b0 = (a0 + a1);
1429
288M
        WORD16 b1 = (a0 - a1);
1430
288M
        WORD16 b2 = (a2 + a3);
1431
288M
        WORD16 b3 = (a2 - a3);
1432
1433
288M
        pi2_dst0[dst_idx] = b0 + b2;
1434
288M
        pi2_dst1[dst_idx] = b1 + b3;
1435
288M
        pi2_dst2[dst_idx] = b0 - b2;
1436
288M
        pi2_dst3[dst_idx] = b1 - b3;
1437
1438
        /* Make the value of dst to zerp, if it falls below the dead-zone */
1439
288M
        if(ABS(pi2_dst0[dst_idx]) > threshold)
1440
35.0M
            *pi4_cbf = 1;
1441
288M
        if(ABS(pi2_dst1[dst_idx]) > threshold)
1442
34.8M
            *pi4_cbf = 1;
1443
288M
        if(ABS(pi2_dst2[dst_idx]) > threshold)
1444
34.8M
            *pi4_cbf = 1;
1445
288M
        if(ABS(pi2_dst3[dst_idx]) > threshold)
1446
34.7M
            *pi4_cbf = 1;
1447
1448
288M
        u4_satd += ABS(pi2_dst0[dst_idx]);
1449
288M
        u4_satd += ABS(pi2_dst1[dst_idx]);
1450
288M
        u4_satd += ABS(pi2_dst2[dst_idx]);
1451
288M
        u4_satd += ABS(pi2_dst3[dst_idx]);
1452
288M
    }
1453
1454
    /* return 32x32 satd */
1455
1.12M
    return (u4_satd);
1456
1.12M
}
1457
//#endif
1458
1459
/**
1460
*******************************************************************************
1461
*
1462
* @brief
1463
*    Hadamard Transform for 32x32 block with 16x6, 8x8 and 4x4 SATD updates.
1464
*    Uses recursive 16x16 had output to compute satd for 32x32 and its children
1465
*
1466
* @par Description:
1467
*
1468
* @param[in] pu1_origin
1469
*  UWORD8 pointer to the current block
1470
*
1471
* @param[in] src_strd
1472
*  WORD32 Source stride
1473
*
1474
* @param[in] pu1_pred
1475
*  UWORD8 pointer to the prediction block
1476
*
1477
* @param[in] pred_strd
1478
*  WORD32 Pred stride
1479
*
1480
* @param[out] pi2_dst
1481
*  WORD16 pointer to the transform output block
1482
*
1483
* @param[out] dst_strd
1484
*  WORD32 Destination stride
1485
*
1486
* @param[out] ppi4_hsad
1487
*   pointer to base pointers for storing hadmard sads of various
1488
*   block sizes (4x4 to 32x32)
1489
*
1490
* @param[in] pos_x_y_4x4
1491
*   Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
1492
*   Lower 16bits denote xpos and upper 16ypos of the 4x4block
1493
*
1494
* @param[in] num_4x4_in_row
1495
*   Denotes the number of current 4x4 blocks in a ctb/CU/MB
1496
*
1497
* @param[in] lambda
1498
*  lambda values is the cost factor calculated based on QP
1499
*
1500
* @param[in] lambda_q_shift
1501
*  lambda_q_shift used to reverse the lambda value back from q8 format
1502
*
1503
* @param[in] depth
1504
*  depth gives the current TU depth with respect to the CU
1505
*
1506
* @param[in] i4_frm_qstep
1507
*  frm_qstep value based on the which the threshold value is calculated
1508
*
1509
*
1510
* @returns
1511
*
1512
* @remarks
1513
*
1514
*******************************************************************************
1515
*/
1516
void ihevce_had_32x32_r(
1517
    UWORD8 *pu1_src,
1518
    WORD32 src_strd,
1519
    UWORD8 *pu1_pred,
1520
    WORD32 pred_strd,
1521
    WORD16 *pi2_dst,
1522
    WORD32 dst_strd,
1523
    WORD32 **ppi4_hsad,
1524
    WORD32 **ppi4_tu_split,
1525
    WORD32 **ppi4_tu_early_cbf,
1526
    WORD32 pos_x_y_4x4,
1527
    WORD32 num_4x4_in_row,
1528
    WORD32 lambda,
1529
    WORD32 lambda_q_shift,
1530
    WORD32 i4_frm_qstep,
1531
    WORD32 i4_cur_depth,
1532
    WORD32 i4_max_depth,
1533
    WORD32 i4_max_tr_size,
1534
    WORD32 *pi4_tu_split_cost,
1535
    me_func_selector_t *ps_func_selector)
1536
1537
1.12M
{
1538
1.12M
    WORD16 ai2_16x16_had[1024];
1539
1.12M
    WORD32 *pi4_32x32_hsad;
1540
1.12M
    WORD32 *pi4_32x32_tu_split;
1541
1.12M
    WORD32 *pi4_32x32_tu_early_cbf;
1542
1543
1.12M
    WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
1544
1.12M
    WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
1545
1.12M
    WORD32 tu_split_flag = 0;
1546
1.12M
    const UWORD8 u1_cur_tr_size = 32;
1547
1.12M
    WORD32 i4_early_cbf_flag = 0, early_cbf = 0;
1548
1549
    /* cost_parent : Stores the cost of the parent HAD transform (16x16) */
1550
    /* cost_child : Stores the cost of the child HAD transform (16x16) */
1551
1.12M
    WORD32 cost_child = 0, cost_parent = 0;
1552
1553
    /*retuned as the best cost for the entire TU (32x32) */
1554
1.12M
    WORD32 best_cost = 0;
1555
    /*captures the best cost and tu_split at child level */
1556
1.12M
    WORD32 best_cost_tu_split;
1557
1558
    /* Initialize pointers to 4 8x8 blocks in 16x16 */
1559
1.12M
    WORD16 *pi2_y0 = ai2_16x16_had;
1560
1.12M
    WORD16 *pi2_y1 = ai2_16x16_had + 16;
1561
1.12M
    WORD16 *pi2_y2 = ai2_16x16_had + 32 * 16;
1562
1.12M
    WORD16 *pi2_y3 = ai2_16x16_had + 32 * 16 + 16;
1563
1564
1.12M
    UWORD8 *pu1_src0 = pu1_src;
1565
1.12M
    UWORD8 *pu1_src1 = pu1_src + 16;
1566
1.12M
    UWORD8 *pu1_src2 = pu1_src + src_strd * 16;
1567
1.12M
    UWORD8 *pu1_src3 = pu1_src + src_strd * 16 + 16;
1568
1569
1.12M
    UWORD8 *pu1_pred0 = pu1_pred;
1570
1.12M
    UWORD8 *pu1_pred1 = pu1_pred + 16;
1571
1.12M
    UWORD8 *pu1_pred2 = pu1_pred + pred_strd * 16;
1572
1.12M
    UWORD8 *pu1_pred3 = pu1_pred + pred_strd * 16 + 16;
1573
1574
1.12M
    ASSERT(pos_x >= 0);
1575
1.12M
    ASSERT(pos_y >= 0);
1576
1577
    /* Initialize pointers to store 32x32 SATDs */
1578
1.12M
    pi4_32x32_hsad = ppi4_hsad[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);
1579
1580
1.12M
    pi4_32x32_tu_split =
1581
1.12M
        ppi4_tu_split[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);
1582
1583
1.12M
    pi4_32x32_tu_early_cbf =
1584
1.12M
        ppi4_tu_early_cbf[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);
1585
1586
    /* -------- Compute four 8x8 HAD Transforms of 16x16 call--------- */
1587
1.12M
    best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1588
1.12M
        pu1_src0,
1589
1.12M
        src_strd,
1590
1.12M
        pu1_pred0,
1591
1.12M
        pred_strd,
1592
1.12M
        pi2_y0,
1593
1.12M
        32,
1594
1.12M
        ppi4_hsad,
1595
1.12M
        ppi4_tu_split,
1596
1.12M
        ppi4_tu_early_cbf,
1597
1.12M
        pos_x_y_4x4,
1598
1.12M
        num_4x4_in_row,
1599
1.12M
        lambda,
1600
1.12M
        lambda_q_shift,
1601
1.12M
        i4_frm_qstep,
1602
1.12M
        i4_cur_depth + 1,
1603
1.12M
        i4_max_depth,
1604
1.12M
        i4_max_tr_size,
1605
1.12M
        pi4_tu_split_cost,
1606
1.12M
        NULL);
1607
1608
    /* cost is shifted by 10bits */
1609
1.12M
    best_cost = best_cost_tu_split >> 10;
1610
1611
    /* Tu split is present in the 6-10 bits */
1612
1.12M
    tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1613
1614
    /*Early CBF info is present in the last 5 bits */
1615
1.12M
    i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1616
1617
1.12M
    tu_split_flag <<= 5;
1618
1.12M
    i4_early_cbf_flag <<= 5;
1619
1620
1.12M
    cost_child += best_cost;
1621
1622
1.12M
    best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1623
1.12M
        pu1_src1,
1624
1.12M
        src_strd,
1625
1.12M
        pu1_pred1,
1626
1.12M
        pred_strd,
1627
1.12M
        pi2_y1,
1628
1.12M
        32,
1629
1.12M
        ppi4_hsad,
1630
1.12M
        ppi4_tu_split,
1631
1.12M
        ppi4_tu_early_cbf,
1632
1.12M
        pos_x_y_4x4 + 4,
1633
1.12M
        num_4x4_in_row,
1634
1.12M
        lambda,
1635
1.12M
        lambda_q_shift,
1636
1.12M
        i4_frm_qstep,
1637
1.12M
        i4_cur_depth + 1,
1638
1.12M
        i4_max_depth,
1639
1.12M
        i4_max_tr_size,
1640
1.12M
        pi4_tu_split_cost,
1641
1.12M
        NULL);
1642
1643
    /* cost is shifted by 10bits */
1644
1.12M
    best_cost = best_cost_tu_split >> 10;
1645
1646
    /* Tu split is present in the 6-10 bits */
1647
1.12M
    tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1648
1649
    /*Early CBF info is present in the last 5 bits */
1650
1.12M
    i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1651
1652
1.12M
    tu_split_flag <<= 5;
1653
1.12M
    i4_early_cbf_flag <<= 5;
1654
1655
1.12M
    cost_child += best_cost;
1656
1657
1.12M
    best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1658
1.12M
        pu1_src2,
1659
1.12M
        src_strd,
1660
1.12M
        pu1_pred2,
1661
1.12M
        pred_strd,
1662
1.12M
        pi2_y2,
1663
1.12M
        32,
1664
1.12M
        ppi4_hsad,
1665
1.12M
        ppi4_tu_split,
1666
1.12M
        ppi4_tu_early_cbf,
1667
1.12M
        pos_x_y_4x4 + (4 << 16),
1668
1.12M
        num_4x4_in_row,
1669
1.12M
        lambda,
1670
1.12M
        lambda_q_shift,
1671
1.12M
        i4_frm_qstep,
1672
1.12M
        i4_cur_depth + 1,
1673
1.12M
        i4_max_depth,
1674
1.12M
        i4_max_tr_size,
1675
1.12M
        pi4_tu_split_cost,
1676
1.12M
        NULL);
1677
1678
    /* cost is shifted by 10bits */
1679
1.12M
    best_cost = best_cost_tu_split >> 10;
1680
1681
    /* Tu split is present in the 6-10 bits */
1682
1.12M
    tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1683
1684
    /*Early CBF info is present in the last 5 bits */
1685
1.12M
    i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1686
1687
1.12M
    tu_split_flag <<= 5;
1688
1.12M
    i4_early_cbf_flag <<= 5;
1689
1690
1.12M
    cost_child += best_cost;
1691
1692
1.12M
    best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1693
1.12M
        pu1_src3,
1694
1.12M
        src_strd,
1695
1.12M
        pu1_pred3,
1696
1.12M
        pred_strd,
1697
1.12M
        pi2_y3,
1698
1.12M
        32,
1699
1.12M
        ppi4_hsad,
1700
1.12M
        ppi4_tu_split,
1701
1.12M
        ppi4_tu_early_cbf,
1702
1.12M
        pos_x_y_4x4 + (4 << 16) + 4,
1703
1.12M
        num_4x4_in_row,
1704
1.12M
        lambda,
1705
1.12M
        lambda_q_shift,
1706
1.12M
        i4_frm_qstep,
1707
1.12M
        i4_cur_depth + 1,
1708
1.12M
        i4_max_depth,
1709
1.12M
        i4_max_tr_size,
1710
1.12M
        pi4_tu_split_cost,
1711
1.12M
        NULL);
1712
1713
    /* cost is shifted by 10bits */
1714
1.12M
    best_cost = best_cost_tu_split >> 10;
1715
1716
    /* Tu split is present in the 6-10 bits */
1717
1.12M
    tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1718
1719
    /*Early CBF info is present in the last 5 bits */
1720
1.12M
    i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1721
1722
1.12M
    tu_split_flag <<= 1;
1723
1.12M
    i4_early_cbf_flag <<= 1;
1724
1725
1.12M
    cost_child += best_cost;
1726
1727
1.12M
    {
1728
1.12M
        UWORD32 u4_satd = 0;
1729
1730
1.12M
        u4_satd = ps_func_selector->pf_compute_32x32HAD_using_16x16(
1731
1.12M
            pi2_y0, 32, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);
1732
1733
1.12M
        cost_parent = ((u4_satd + 2) >> 2);
1734
1.12M
    }
1735
1736
    /* 4 TU_Split flags , 4 CBF Flags*/
1737
1.12M
    cost_child += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1738
1739
1.12M
    i4_early_cbf_flag += early_cbf;
1740
1741
    /* 1 TU_SPlit flag, 1 CBF flag */
1742
    //cost_parent += ((1 + 1)* lambda) >>  (lambda_q_shift + 1);
1743
1744
1.12M
    if(i4_cur_depth < i4_max_depth)
1745
1.06M
    {
1746
1.06M
        if((cost_child < cost_parent) || (u1_cur_tr_size > i4_max_tr_size))
1747
266k
        {
1748
266k
            *pi4_tu_split_cost += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1749
266k
            best_cost = cost_child;
1750
266k
            tu_split_flag++;
1751
266k
        }
1752
795k
        else
1753
795k
        {
1754
795k
            tu_split_flag = 0;
1755
795k
            best_cost = cost_parent;
1756
795k
        }
1757
1.06M
    }
1758
63.7k
    else
1759
63.7k
    {
1760
63.7k
        tu_split_flag = 0;
1761
63.7k
        best_cost = cost_parent;
1762
63.7k
    }
1763
1764
1.12M
    pi4_32x32_tu_split[0] = tu_split_flag;
1765
1766
1.12M
    pi4_32x32_hsad[0] = best_cost;
1767
1768
1.12M
    pi4_32x32_tu_early_cbf[0] = i4_early_cbf_flag;
1769
1.12M
}