Coverage Report

Created: 2026-02-14 06:43

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libhevc/encoder/ihevce_had_satd.c
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2018 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
21
/**
22
******************************************************************************
23
* @file ihevce_had_satd.c
24
*
25
* @brief
26
*    This file contains functions of Hadamard SAD and SATD
27
*
28
* @author
29
*    Ittiam
30
*
31
* List of Functions
32
*   <TODO: TO BE ADDED>
33
*
34
******************************************************************************
35
*/
36
37
/*****************************************************************************/
38
/* File Includes                                                             */
39
/*****************************************************************************/
40
/* System include files */
41
#include <stdio.h>
42
#include <string.h>
43
#include <stdlib.h>
44
#include <assert.h>
45
#include <stdarg.h>
46
#include <math.h>
47
48
/* User include files */
49
#include "ihevc_typedefs.h"
50
#include "itt_video_api.h"
51
#include "ihevce_api.h"
52
53
#include "rc_cntrl_param.h"
54
#include "rc_frame_info_collector.h"
55
#include "rc_look_ahead_params.h"
56
57
#include "ihevc_defs.h"
58
#include "ihevc_structs.h"
59
#include "ihevc_platform_macros.h"
60
#include "ihevc_deblk.h"
61
#include "ihevc_itrans_recon.h"
62
#include "ihevc_chroma_itrans_recon.h"
63
#include "ihevc_chroma_intra_pred.h"
64
#include "ihevc_intra_pred.h"
65
#include "ihevc_inter_pred.h"
66
#include "ihevc_mem_fns.h"
67
#include "ihevc_padding.h"
68
#include "ihevc_weighted_pred.h"
69
#include "ihevc_sao.h"
70
#include "ihevc_resi_trans.h"
71
#include "ihevc_quant_iquant_ssd.h"
72
#include "ihevc_cabac_tables.h"
73
74
#include "ihevce_defs.h"
75
#include "ihevce_lap_enc_structs.h"
76
#include "ihevce_multi_thrd_structs.h"
77
#include "ihevce_multi_thrd_funcs.h"
78
#include "ihevce_me_common_defs.h"
79
#include "ihevce_had_satd.h"
80
#include "ihevce_error_codes.h"
81
#include "ihevce_bitstream.h"
82
#include "ihevce_cabac.h"
83
#include "ihevce_rdoq_macros.h"
84
#include "ihevce_function_selector.h"
85
#include "ihevce_enc_structs.h"
86
#include "ihevce_cmn_utils_instr_set_router.h"
87
#include "hme_datatype.h"
88
#include "hme_interface.h"
89
#include "hme_common_defs.h"
90
#include "hme_defs.h"
91
92
/*****************************************************************************/
93
/* Function Definitions                                                      */
94
/*****************************************************************************/
95
96
static void ihevce_hadamard_4x4_8bit(
97
    UWORD8 *pu1_src,
98
    WORD32 src_strd,
99
    UWORD8 *pu1_pred,
100
    WORD32 pred_strd,
101
    WORD16 *pi2_dst,
102
    WORD32 dst_strd)
103
99.7M
{
104
99.7M
    WORD32 k;
105
99.7M
    WORD16 m[16];
106
107
    /*===== hadamard horz transform =====*/
108
498M
    for(k = 0; k < 4; k++)
109
398M
    {
110
398M
        WORD32 r0, r1, r2, r3;
111
398M
        WORD32 h0, h1, h2, h3;
112
113
        /* Compute the residue block */
114
398M
        r0 = pu1_src[0] - pu1_pred[0];
115
398M
        r1 = pu1_src[1] - pu1_pred[1];
116
398M
        r2 = pu1_src[2] - pu1_pred[2];
117
398M
        r3 = pu1_src[3] - pu1_pred[3];
118
119
398M
        h0 = r0 + r1;
120
398M
        h1 = r0 - r1;
121
398M
        h2 = r2 + r3;
122
398M
        h3 = r2 - r3;
123
124
398M
        m[k * 4 + 0] = h0 + h2;
125
398M
        m[k * 4 + 1] = h1 + h3;
126
398M
        m[k * 4 + 2] = h0 - h2;
127
398M
        m[k * 4 + 3] = h1 - h3;
128
129
398M
        pu1_pred += pred_strd;
130
398M
        pu1_src += src_strd;
131
398M
    }
132
133
    /*===== hadamard vert transform =====*/
134
498M
    for(k = 0; k < 4; k++)
135
398M
    {
136
398M
        WORD32 v0, v1, v2, v3;
137
138
398M
        v0 = m[0 + k] + m[4 + k];
139
398M
        v1 = m[0 + k] - m[4 + k];
140
398M
        v2 = m[8 + k] + m[12 + k];
141
398M
        v3 = m[8 + k] - m[12 + k];
142
143
398M
        pi2_dst[0 * dst_strd + k] = v0 + v2;
144
398M
        pi2_dst[1 * dst_strd + k] = v1 + v3;
145
398M
        pi2_dst[2 * dst_strd + k] = v0 - v2;
146
398M
        pi2_dst[3 * dst_strd + k] = v1 - v3;
147
398M
    }
148
99.7M
}
149
150
static void ihevce_hadamard_8x8_8bit(
151
    UWORD8 *pu1_src,
152
    WORD32 src_strd,
153
    UWORD8 *pu1_pred,
154
    WORD32 pred_strd,
155
    WORD16 *pi2_dst,
156
    WORD32 dst_strd)
157
12.6M
{
158
12.6M
    WORD32 i;
159
160
    // y0
161
12.6M
    ihevce_hadamard_4x4_8bit(pu1_src, src_strd, pu1_pred, pred_strd, pi2_dst, dst_strd);
162
    // y1
163
12.6M
    ihevce_hadamard_4x4_8bit(pu1_src + 4, src_strd, pu1_pred + 4, pred_strd, pi2_dst + 4, dst_strd);
164
    // y2
165
12.6M
    ihevce_hadamard_4x4_8bit(
166
12.6M
        pu1_src + 4 * src_strd,
167
12.6M
        src_strd,
168
12.6M
        pu1_pred + 4 * pred_strd,
169
12.6M
        pred_strd,
170
12.6M
        pi2_dst + (4 * dst_strd),
171
12.6M
        dst_strd);
172
    // y3
173
12.6M
    ihevce_hadamard_4x4_8bit(
174
12.6M
        pu1_src + 4 + 4 * src_strd,
175
12.6M
        src_strd,
176
12.6M
        pu1_pred + 4 + 4 * pred_strd,
177
12.6M
        pred_strd,
178
12.6M
        pi2_dst + (4 * dst_strd) + 4,
179
12.6M
        dst_strd);
180
181
    /*   Child HAD results combined as follows to get Parent result */
182
    /*  _                                                 _         */
183
    /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
184
    /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
185
    /* \-                                                 -/        */
186
215M
    for(i = 0; i < 16; i++)
187
203M
    {
188
203M
        WORD32 idx = (i >> 2) * dst_strd + (i % 4);
189
203M
        WORD16 a0 = pi2_dst[idx];
190
203M
        WORD16 a1 = pi2_dst[4 + idx];
191
203M
        WORD16 a2 = pi2_dst[(4 * dst_strd) + idx];
192
203M
        WORD16 a3 = pi2_dst[(4 * dst_strd) + 4 + idx];
193
194
203M
        WORD16 b0 = (a0 + a1);
195
203M
        WORD16 b1 = (a0 - a1);
196
203M
        WORD16 b2 = (a2 + a3);
197
203M
        WORD16 b3 = (a2 - a3);
198
199
203M
        pi2_dst[idx] = b0 + b2;
200
203M
        pi2_dst[4 + idx] = b1 + b3;
201
203M
        pi2_dst[(4 * dst_strd) + idx] = b0 - b2;
202
203M
        pi2_dst[(4 * dst_strd) + 4 + idx] = b1 - b3;
203
203M
    }
204
12.6M
}
205
206
static void ihevce_hadamard_16x16_8bit(
207
    UWORD8 *pu1_src,
208
    WORD32 src_strd,
209
    UWORD8 *pu1_pred,
210
    WORD32 pred_strd,
211
    WORD16 *pi2_dst,
212
    WORD32 dst_strd)
213
1.73M
{
214
1.73M
    WORD32 i;
215
216
    // y0
217
1.73M
    ihevce_hadamard_8x8_8bit(pu1_src, src_strd, pu1_pred, pred_strd, pi2_dst, dst_strd);
218
    // y1
219
1.73M
    ihevce_hadamard_8x8_8bit(pu1_src + 8, src_strd, pu1_pred + 8, pred_strd, pi2_dst + 8, dst_strd);
220
    // y2
221
1.73M
    ihevce_hadamard_8x8_8bit(
222
1.73M
        pu1_src + 8 * src_strd,
223
1.73M
        src_strd,
224
1.73M
        pu1_pred + 8 * pred_strd,
225
1.73M
        pred_strd,
226
1.73M
        pi2_dst + (8 * dst_strd),
227
1.73M
        dst_strd);
228
    // y3
229
1.73M
    ihevce_hadamard_8x8_8bit(
230
1.73M
        pu1_src + 8 + 8 * src_strd,
231
1.73M
        src_strd,
232
1.73M
        pu1_pred + 8 + 8 * pred_strd,
233
1.73M
        pred_strd,
234
1.73M
        pi2_dst + (8 * dst_strd) + 8,
235
1.73M
        dst_strd);
236
237
    /*   Child HAD results combined as follows to get Parent result */
238
    /*  _                                                 _         */
239
    /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
240
    /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
241
    /* \-                                                 -/        */
242
112M
    for(i = 0; i < 64; i++)
243
111M
    {
244
111M
        WORD32 idx = (i >> 3) * dst_strd + (i % 8);
245
111M
        WORD16 a0 = pi2_dst[idx];
246
111M
        WORD16 a1 = pi2_dst[8 + idx];
247
111M
        WORD16 a2 = pi2_dst[(8 * dst_strd) + idx];
248
111M
        WORD16 a3 = pi2_dst[(8 * dst_strd) + 8 + idx];
249
250
111M
        WORD16 b0 = (a0 + a1) >> 1;
251
111M
        WORD16 b1 = (a0 - a1) >> 1;
252
111M
        WORD16 b2 = (a2 + a3) >> 1;
253
111M
        WORD16 b3 = (a2 - a3) >> 1;
254
255
111M
        pi2_dst[idx] = b0 + b2;
256
111M
        pi2_dst[8 + idx] = b1 + b3;
257
111M
        pi2_dst[(8 * dst_strd) + idx] = b0 - b2;
258
111M
        pi2_dst[(8 * dst_strd) + 8 + idx] = b1 - b3;
259
111M
    }
260
1.73M
}
261
262
static void ihevce_hadamard_32x32_8bit(
263
    UWORD8 *pu1_src,
264
    WORD32 src_strd,
265
    UWORD8 *pu1_pred,
266
    WORD32 pred_strd,
267
    WORD16 *pi2_dst,
268
    WORD32 dst_strd)
269
161k
{
270
161k
    WORD32 i;
271
272
    // y0
273
161k
    ihevce_hadamard_16x16_8bit(pu1_src, src_strd, pu1_pred, pred_strd, pi2_dst, dst_strd);
274
    // y1
275
161k
    ihevce_hadamard_16x16_8bit(
276
161k
        pu1_src + 16, src_strd, pu1_pred + 16, pred_strd, pi2_dst + 16, dst_strd);
277
    // y2
278
161k
    ihevce_hadamard_16x16_8bit(
279
161k
        pu1_src + 16 * src_strd,
280
161k
        src_strd,
281
161k
        pu1_pred + 16 * pred_strd,
282
161k
        pred_strd,
283
161k
        pi2_dst + (16 * dst_strd),
284
161k
        dst_strd);
285
    // y3
286
161k
    ihevce_hadamard_16x16_8bit(
287
161k
        pu1_src + 16 + 16 * src_strd,
288
161k
        src_strd,
289
161k
        pu1_pred + 16 + 16 * pred_strd,
290
161k
        pred_strd,
291
161k
        pi2_dst + (16 * dst_strd) + 16,
292
161k
        dst_strd);
293
294
    /*   Child HAD results combined as follows to get Parent result */
295
    /*  _                                                 _         */
296
    /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
297
    /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
298
    /* \-                                                 -/        */
299
41.6M
    for(i = 0; i < 256; i++)
300
41.4M
    {
301
41.4M
        WORD32 idx = (i >> 4) * dst_strd + (i % 16);
302
41.4M
        WORD16 a0 = pi2_dst[idx] >> 2;
303
41.4M
        WORD16 a1 = pi2_dst[16 + idx] >> 2;
304
41.4M
        WORD16 a2 = pi2_dst[(16 * dst_strd) + idx] >> 2;
305
41.4M
        WORD16 a3 = pi2_dst[(16 * dst_strd) + 16 + idx] >> 2;
306
307
41.4M
        WORD16 b0 = (a0 + a1);
308
41.4M
        WORD16 b1 = (a0 - a1);
309
41.4M
        WORD16 b2 = (a2 + a3);
310
41.4M
        WORD16 b3 = (a2 - a3);
311
312
41.4M
        pi2_dst[idx] = b0 + b2;
313
41.4M
        pi2_dst[16 + idx] = b1 + b3;
314
41.4M
        pi2_dst[(16 * dst_strd) + idx] = b0 - b2;
315
41.4M
        pi2_dst[(16 * dst_strd) + 16 + idx] = b1 - b3;
316
41.4M
    }
317
161k
}
318
319
/**
320
*******************************************************************************
321
*
322
* @brief
323
*  Compute Hadamard sad for 4x4 block with 8-bit input
324
*
325
* @par Description:
326
*
327
* @param[in] pu1_origin
328
*  UWORD8 pointer to the current block
329
*
330
* @param[in] src_strd
331
*  WORD32 Source stride
332
*
333
* @param[in] pu1_pred_buf
334
*  UWORD8 pointer to the prediction block
335
*
336
* @param[in] pred_strd
337
*  WORD32 Pred stride
338
*
339
* @param[in] pi2_dst
340
*  WORD16 pointer to the transform block
341
*
342
* @param[in] dst_strd
343
*  WORD32 Destination stride
344
*
345
* @param[in] size
346
*  WORD32 transform Block size
347
*
348
* @returns hadamard SAD
349
*
350
* @remarks
351
*  Not updating the transform destination now. Only returning the SATD
352
*
353
*******************************************************************************
354
*/
355
UWORD32 ihevce_HAD_4x4_8bit(
356
    UWORD8 *pu1_origin,
357
    WORD32 src_strd,
358
    UWORD8 *pu1_pred_buf,
359
    WORD32 pred_strd,
360
    WORD16 *pi2_dst,
361
    WORD32 dst_strd)
362
14.8M
{
363
14.8M
    WORD32 k;
364
14.8M
    WORD16 v[16];
365
14.8M
    UWORD32 u4_sad = 0;
366
367
14.8M
    (void)pi2_dst;
368
14.8M
    (void)dst_strd;
369
14.8M
    ihevce_hadamard_4x4_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 4);
370
371
252M
    for(k = 0; k < 16; ++k)
372
237M
        u4_sad += abs(v[k]);
373
14.8M
    u4_sad = ((u4_sad + 2) >> 2);
374
375
14.8M
    return u4_sad;
376
14.8M
}
377
378
/**
379
*******************************************************************************
380
*
381
* @brief
382
*  Computes Hadamard Sad for 8x8 block with 8-bit input
383
*
384
* @par Description:
385
*
386
* @param[in] pu1_origin
387
*  UWORD8 pointer to the current block
388
*
389
* @param[in] src_strd
390
*  WORD32 Source stride
391
*
392
* @param[in] pu1_pred_buf
393
*  UWORD8 pointer to the prediction block
394
*
395
* @param[in] pred_strd
396
*  WORD32 Pred stride
397
*
398
* @param[in] pi2_dst
399
*  WORD16 pointer to the transform block
400
*
401
* @param[in] dst_strd
402
*  WORD32 Destination stride
403
*
404
* @param[in] size
405
*  WORD32 transform Block size
406
*
407
* @returns Hadamard SAD
408
*
409
* @remarks
410
*  Not updating the transform destination now. Only returning the SATD
411
*
412
*******************************************************************************
413
*/
414
UWORD32 ihevce_HAD_8x8_8bit(
415
    UWORD8 *pu1_origin,
416
    WORD32 src_strd,
417
    UWORD8 *pu1_pred_buf,
418
    WORD32 pred_strd,
419
    WORD16 *pi2_dst,
420
    WORD32 dst_strd)
421
5.74M
{
422
5.74M
    WORD32 k;
423
5.74M
    UWORD32 u4_sad = 0;
424
5.74M
    WORD16 v[64];
425
426
5.74M
    (void)pi2_dst;
427
5.74M
    (void)dst_strd;
428
5.74M
    ihevce_hadamard_8x8_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 8);
429
430
373M
    for(k = 0; k < 64; ++k)
431
367M
        u4_sad += abs(v[k]);
432
5.74M
    u4_sad = ((u4_sad + 4) >> 3);
433
434
5.74M
    return u4_sad;
435
5.74M
}
436
437
/**
438
*******************************************************************************
439
*
440
* @brief
441
*  Compute dc suppressed hadamard sad for 8x8 block with 8-bit input
442
*
443
* @par Description:
444
*
445
* @param[in] pu1_origin
446
*  UWORD8 pointer to the current block
447
*
448
* @param[in] src_strd
449
*  WORD32 Source stride
450
*
451
* @param[in] pu1_pred_buf
452
*  UWORD8 pointer to the prediction block
453
*
454
* @param[in] pred_strd
455
*  WORD32 Pred stride
456
*
457
* @param[in] pi2_dst
458
*  WORD16 pointer to the transform block
459
*
460
* @param[in] dst_strd
461
*  WORD32 Destination stride
462
*
463
* @param[in] size
464
*  WORD32 transform Block size
465
*
466
* @returns Hadamard SAD with DC Suppressed
467
*
468
* @remarks
469
*  Not updating the transform destination now. Only returning the SATD
470
*
471
*******************************************************************************
472
*/
473
UWORD32 ihevce_compute_ac_had_8x8_8bit(
474
    UWORD8 *pu1_origin,
475
    WORD32 src_strd,
476
    UWORD8 *pu1_pred_buf,
477
    WORD32 pred_strd,
478
    WORD16 *pi2_dst,
479
    WORD32 dst_strd)
480
0
{
481
0
    WORD32 k;
482
0
    UWORD32 u4_sad = 0;
483
0
    WORD16 v[64];
484
485
0
    (void)pi2_dst;
486
0
    (void)dst_strd;
487
0
    ihevce_hadamard_8x8_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 8);
488
489
0
    v[0] = 0;
490
0
    for(k = 0; k < 64; ++k)
491
0
        u4_sad += abs(v[k]);
492
0
    u4_sad = ((u4_sad + 4) >> 3);
493
494
0
    return u4_sad;
495
0
}
496
497
/**
498
*******************************************************************************
499
*
500
* @brief
501
*  Computes Hadamard Sad for 16x16 block with 8-bit input
502
*
503
* @par Description:
504
*
505
* @param[in] pu1_origin
506
*  UWORD8 pointer to the current block
507
*
508
* @param[in] src_strd
509
*  WORD32 Source stride
510
*
511
* @param[in] pu1_pred_buf
512
*  UWORD8 pointer to the prediction block
513
*
514
* @param[in] pred_strd
515
*  WORD32 Pred stride
516
*
517
* @param[in] pi2_dst
518
*  WORD16 pointer to the transform block
519
*
520
* @param[in] dst_strd
521
*  WORD32 Destination stride
522
*
523
* @param[in] size
524
*  WORD32 transform Block size
525
*
526
* @returns Hadamard SAD
527
*
528
* @remarks
529
*  Not updating the transform destination now. Only returning the SATD
530
*
531
*******************************************************************************
532
*/
533
UWORD32 ihevce_HAD_16x16_8bit(
534
    UWORD8 *pu1_origin,
535
    WORD32 src_strd,
536
    UWORD8 *pu1_pred_buf,
537
    WORD32 pred_strd,
538
    WORD16 *pi2_dst,
539
    WORD32 dst_strd)
540
1.09M
{
541
1.09M
    WORD32 k;
542
1.09M
    UWORD32 u4_sad = 0;
543
1.09M
    WORD16 v[256];
544
545
1.09M
    (void)pi2_dst;
546
1.09M
    (void)dst_strd;
547
1.09M
    ihevce_hadamard_16x16_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 16);
548
549
280M
    for(k = 0; k < 256; ++k)
550
279M
        u4_sad += abs(v[k]);
551
1.09M
    u4_sad = ((u4_sad + 4) >> 3);
552
553
1.09M
    return u4_sad;
554
1.09M
}
555
556
/**
557
*******************************************************************************
558
*
559
* @brief
560
*  Computes Hadamard Sad for 32x32 block with 8-bit input
561
*
562
* @par Description:
563
*
564
* @param[in] pu1_origin
565
*  UWORD8 pointer to the current block
566
*
567
* @param[in] src_strd
568
*  WORD32 Source stride
569
*
570
* @param[in] pu1_pred_buf
571
*  UWORD8 pointer to the prediction block
572
*
573
* @param[in] pred_strd
574
*  WORD32 Pred stride
575
*
576
* @param[in] pi2_dst
577
*  WORD16 pointer to the transform block
578
*
579
* @param[in] dst_strd
580
*  WORD32 Destination stride
581
*
582
* @param[in] size
583
*  WORD32 transform Block size
584
*
585
* @returns Hadamard SAD
586
*
587
* @remarks
588
*  Not updating the transform destination now. Only returning the SATD
589
*
590
*******************************************************************************
591
*/
592
UWORD32 ihevce_HAD_32x32_8bit(
593
    UWORD8 *pu1_origin,
594
    WORD32 src_strd,
595
    UWORD8 *pu1_pred_buf,
596
    WORD32 pred_strd,
597
    WORD16 *pi2_dst,
598
    WORD32 dst_strd)
599
161k
{
600
161k
    WORD32 k;
601
161k
    UWORD32 u4_sad = 0;
602
161k
    WORD16 v[32 * 32];
603
604
161k
    (void)pi2_dst;
605
161k
    (void)dst_strd;
606
161k
    ihevce_hadamard_32x32_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 32);
607
608
165M
    for(k = 0; k < 32 * 32; ++k)
609
165M
        u4_sad += abs(v[k]);
610
161k
    u4_sad = ((u4_sad + 2) >> 2);
611
612
161k
    return u4_sad;
613
161k
}
614
615
//#if COMPUTE_16x16_R == C
616
/**
617
*******************************************************************************
618
*
619
* @brief
620
*   Computes 8x8 transform using children 4x4 hadamard results
621
*
622
* @par Description:
623
*
624
* @param[in] pi2_4x4_had
625
*  WORD16 pointer to 4x4 hadamard buffer(y0, y1, y2, y3 hadamard in Zscan order)
626
*
627
* @param[in] had4_strd
628
*  stride of 4x4 hadamard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3
629
*
630
* @param[out] pi2_dst
631
*  destination buffer where 8x8 hadamard result is stored
632
*
633
* @param[in] dst_strd
634
*  stride of destination block
635
*
636
* @param[in] i4_frm_qstep
637
*  frm_qstep value based on which the threshold value is calculated
638
*
639
* @returns
640
*  8x8 Hadamard SATD
641
* @remarks
642
*
643
*******************************************************************************
644
*/
645
static UWORD32 ihevce_compute_8x8HAD_using_4x4(
646
    WORD16 *pi2_4x4_had,
647
    WORD32 had4_strd,
648
    WORD16 *pi2_dst,
649
    WORD32 dst_strd,
650
    WORD32 i4_frm_qstep,
651
    WORD32 *pi4_cbf)
652
1.65M
{
653
    /* Qstep value is right shifted by 8 */
654
1.65M
    WORD32 threshold = (i4_frm_qstep >> 8);
655
656
    /* Initialize pointers to 4 subblocks of 4x4 HAD buffer */
657
1.65M
    WORD16 *pi2_y0 = pi2_4x4_had;
658
1.65M
    WORD16 *pi2_y1 = pi2_4x4_had + 4;
659
1.65M
    WORD16 *pi2_y2 = pi2_4x4_had + had4_strd * 4;
660
1.65M
    WORD16 *pi2_y3 = pi2_4x4_had + had4_strd * 4 + 4;
661
662
    /* Initialize pointers to store 8x8 HAD output */
663
1.65M
    WORD16 *pi2_dst0 = pi2_dst;
664
1.65M
    WORD16 *pi2_dst1 = pi2_dst + 4;
665
1.65M
    WORD16 *pi2_dst2 = pi2_dst + dst_strd * 4;
666
1.65M
    WORD16 *pi2_dst3 = pi2_dst + dst_strd * 4 + 4;
667
668
1.65M
    UWORD32 u4_satd = 0;
669
1.65M
    WORD32 i;
670
671
    /*   Child HAD results combined as follows to get Parent result */
672
    /*  _                                                 _         */
673
    /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
674
    /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
675
    /* \-                                                 -/        */
676
28.1M
    for(i = 0; i < 16; i++)
677
26.5M
    {
678
26.5M
        WORD32 src_idx = (i >> 2) * had4_strd + (i % 4);
679
26.5M
        WORD32 dst_idx = (i >> 2) * dst_strd + (i % 4);
680
681
26.5M
        WORD16 a0 = pi2_y0[src_idx];
682
26.5M
        WORD16 a1 = pi2_y1[src_idx];
683
26.5M
        WORD16 a2 = pi2_y2[src_idx];
684
26.5M
        WORD16 a3 = pi2_y3[src_idx];
685
686
26.5M
        WORD16 b0 = (a0 + a1);
687
26.5M
        WORD16 b1 = (a0 - a1);
688
26.5M
        WORD16 b2 = (a2 + a3);
689
26.5M
        WORD16 b3 = (a2 - a3);
690
691
26.5M
        pi2_dst0[dst_idx] = b0 + b2;
692
26.5M
        pi2_dst1[dst_idx] = b1 + b3;
693
26.5M
        pi2_dst2[dst_idx] = b0 - b2;
694
26.5M
        pi2_dst3[dst_idx] = b1 - b3;
695
696
26.5M
        if(ABS(pi2_dst0[dst_idx]) > threshold)
697
5.71M
            *pi4_cbf = 1;
698
26.5M
        if(ABS(pi2_dst1[dst_idx]) > threshold)
699
5.45M
            *pi4_cbf = 1;
700
26.5M
        if(ABS(pi2_dst2[dst_idx]) > threshold)
701
5.61M
            *pi4_cbf = 1;
702
26.5M
        if(ABS(pi2_dst3[dst_idx]) > threshold)
703
5.41M
            *pi4_cbf = 1;
704
705
26.5M
        u4_satd += ABS(pi2_dst0[dst_idx]);
706
26.5M
        u4_satd += ABS(pi2_dst1[dst_idx]);
707
26.5M
        u4_satd += ABS(pi2_dst2[dst_idx]);
708
26.5M
        u4_satd += ABS(pi2_dst3[dst_idx]);
709
26.5M
    }
710
711
    /* return the 8x8 satd */
712
1.65M
    return (u4_satd);
713
1.65M
}
714
715
/**
716
*******************************************************************************
717
*
718
* @brief
719
*    Computes Residue and Hadamard Transform for four 4x4 blocks (Z scan) of
720
*    a 8x8 block (Residue is computed for 8-bit src and prediction buffers)
721
*    Modified to incorporate the dead-zone implementation - Lokesh
722
*
723
* @par Description:
724
*
725
* @param[in] pu1_origin
726
*  UWORD8 pointer to the current block
727
*
728
* @param[in] src_strd
729
*  WORD32 Source stride
730
*
731
* @param[in] pu1_pred
732
*  UWORD8 pointer to the prediction block
733
*
734
* @param[in] pred_strd
735
*  WORD32 Pred stride
736
*
737
* @param[out] pi2_dst
738
*  WORD16 pointer to the transform block
739
*
740
* @param[in] dst_strd
741
*  WORD32 Destination stride
742
*
743
* @param[out] pi4_hsad
744
*  array for storing hadamard sad of each 4x4 block
745
*
746
* @param[in] hsad_stride
747
*  stride of hadamard sad destination buffer (for Zscan order of storing sads)
748
*
749
* @param[in] i4_frm_qstep
750
*  frm_qstep value based on which the threshold value is calculated
751
*
752
* @returns
753
*
754
* @remarks
755
*
756
*******************************************************************************
757
*/
758
static WORD32 ihevce_had4_4x4(
759
    UWORD8 *pu1_src,
760
    WORD32 src_strd,
761
    UWORD8 *pu1_pred,
762
    WORD32 pred_strd,
763
    WORD16 *pi2_dst4x4,
764
    WORD32 dst_strd,
765
    WORD32 *pi4_hsad,
766
    WORD32 hsad_stride,
767
    WORD32 i4_frm_qstep)
768
8.51M
{
769
8.51M
    WORD32 i, k;
770
8.51M
    WORD32 i4_child_total_sad = 0;
771
772
8.51M
    (void)i4_frm_qstep;
773
    /* -------- Compute four 4x4 HAD Transforms ---------*/
774
42.5M
    for(i = 0; i < 4; i++)
775
34.0M
    {
776
34.0M
        UWORD8 *pu1_pi0, *pu1_pi1;
777
34.0M
        WORD16 *pi2_dst;
778
34.0M
        WORD32 blkx, blky;
779
34.0M
        UWORD32 u4_hsad = 0;
780
        // TODO: choose deadzone as f(qstep)
781
34.0M
        WORD32 threshold = 0;
782
783
        /*****************************************************/
784
        /*    Assuming the looping structure of the four     */
785
        /*    blocks is in Z scan order of 4x4s in a 8x8     */
786
        /*    block instead of raster scan                   */
787
        /*****************************************************/
788
34.0M
        blkx = (i & 0x1);
789
34.0M
        blky = (i >> 1);
790
791
34.0M
        pu1_pi0 = pu1_src + (blkx * 4) + (blky * 4 * src_strd);
792
34.0M
        pu1_pi1 = pu1_pred + (blkx * 4) + (blky * 4 * pred_strd);
793
34.0M
        pi2_dst = pi2_dst4x4 + (blkx * 4) + (blky * 4 * dst_strd);
794
795
34.0M
        ihevce_hadamard_4x4_8bit(pu1_pi0, src_strd, pu1_pi1, pred_strd, pi2_dst, dst_strd);
796
797
170M
        for(k = 0; k < 4; k++)
798
136M
        {
799
136M
            if(ABS(pi2_dst[0 * dst_strd + k]) < threshold)
800
0
                pi2_dst[0 * dst_strd + k] = 0;
801
802
136M
            if(ABS(pi2_dst[1 * dst_strd + k]) < threshold)
803
0
                pi2_dst[1 * dst_strd + k] = 0;
804
805
136M
            if(ABS(pi2_dst[2 * dst_strd + k]) < threshold)
806
0
                pi2_dst[2 * dst_strd + k] = 0;
807
808
136M
            if(ABS(pi2_dst[3 * dst_strd + k]) < threshold)
809
0
                pi2_dst[3 * dst_strd + k] = 0;
810
811
            /* Accumulate the SATD */
812
136M
            u4_hsad += ABS(pi2_dst[0 * dst_strd + k]);
813
136M
            u4_hsad += ABS(pi2_dst[1 * dst_strd + k]);
814
136M
            u4_hsad += ABS(pi2_dst[2 * dst_strd + k]);
815
136M
            u4_hsad += ABS(pi2_dst[3 * dst_strd + k]);
816
136M
        }
817
818
        /*===== Normalize the HSAD =====*/
819
34.0M
        pi4_hsad[blkx + (blky * hsad_stride)] = ((u4_hsad + 2) >> 2);
820
34.0M
        i4_child_total_sad += ((u4_hsad + 2) >> 2);
821
34.0M
    }
822
8.51M
    return i4_child_total_sad;
823
8.51M
}
824
825
/**
826
*******************************************************************************
827
*
828
* @brief
829
*    HSAD is returned for the 4, 4x4 in 8x8
830
*
831
* @par Description:
832
*
833
* @param[in] pu1_origin
834
*  UWORD8 pointer to the current block
835
*
836
* @param[in] src_strd
837
*  WORD32 Source stride
838
*
839
* @param[in] pu1_pred
840
*  UWORD8 pointer to the prediction block
841
*
842
* @param[in] pred_strd
843
*  WORD32 Pred stride
844
*
845
* @param[out] pi2_dst
846
*  WORD16 pointer to the transform output block
847
*
848
* @param[out] dst_strd
849
*  WORD32 Destination stride
850
*
851
* @param[out] ppi4_hsad
852
*   pointer to base pointers for storing hadmard sads of various
853
*   block sizes (4x4 to 32x32)
854
*
855
* @param[in] pos_x_y_4x4
856
*   Denotes packed x,y position of current 4x4 block w.r.t. the start of ctb/CU/MB
857
*   Lower 16 bits denote xpos and upper 16 bits denote ypos of the 4x4 block
858
*
859
* @param[in] num_4x4_in_row
860
*   Denotes the number of current 4x4 blocks in a ctb/CU/MB
861
*
862
* @returns
863
*
864
* @remarks
865
*
866
*******************************************************************************
867
*/
868
void ihevce_had_8x8_using_4_4x4(
869
    UWORD8 *pu1_src,
870
    WORD32 src_strd,
871
    UWORD8 *pu1_pred,
872
    WORD32 pred_strd,
873
    WORD16 *pi2_dst,
874
    WORD32 dst_strd,
875
    WORD32 **ppi4_hsad,
876
    WORD32 pos_x_y_4x4,
877
    WORD32 num_4x4_in_row)
878
6.86M
{
879
6.86M
    WORD16 ai2_4x4_had[64];
880
6.86M
    WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
881
6.86M
    WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
882
6.86M
    WORD32 *pi4_4x4_hsad;
883
6.86M
    WORD32 *pi4_8x8_hsad;
884
885
6.86M
    (void)pi2_dst;
886
6.86M
    (void)dst_strd;
887
6.86M
    ASSERT(pos_x >= 0);
888
6.86M
    ASSERT(pos_y >= 0);
889
890
    /* Initialize pointers to  store 4x4 and 8x8 HAD SATDs */
891
6.86M
    pi4_4x4_hsad = ppi4_hsad[HAD_4x4] + pos_x + pos_y * num_4x4_in_row;
892
6.86M
    pi4_8x8_hsad = ppi4_hsad[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
893
894
    /* -------- Compute four 4x4 HAD Transforms of 8x8 in one call--------- */
895
6.86M
    pi4_8x8_hsad[0] = ihevce_had4_4x4(
896
6.86M
        pu1_src, src_strd, pu1_pred, pred_strd, ai2_4x4_had, 8, pi4_4x4_hsad, num_4x4_in_row, 0);
897
6.86M
}
898
899
/**
900
*******************************************************************************
901
*
902
* @brief
903
*    Recursive Hadamard Transform for 8x8 block. HSAD is returned for the 8x8
904
*    block and its four subblocks(4x4).
905
*
906
* @par Description:
907
*
908
* @param[in] pu1_origin
909
*  UWORD8 pointer to the current block
910
*
911
* @param[in] src_strd
912
*  WORD32 Source stride
913
*
914
* @param[in] pu1_pred
915
*  UWORD8 pointer to the prediction block
916
*
917
* @param[in] pred_strd
918
*  WORD32 Pred stride
919
*
920
* @param[out] pi2_dst
921
*  WORD16 pointer to the transform output block
922
*
923
* @param[out] dst_strd
924
*  WORD32 Destination stride
925
*
926
* @param[out] ppi4_hsad
927
*   pointer to base pointers for storing hadmard sads of various
928
*   block sizes (4x4 to 32x32)
929
*
930
* @param[in] pos_x_y_4x4
931
*   Denotes packed x,y position of current 4x4 block w.r.t. the start of ctb/CU/MB
932
*   Lower 16 bits denote xpos and upper 16 bits denote ypos of the 4x4 block
933
*
934
* @param[in] num_4x4_in_row
935
*   Denotes the number of current 4x4 blocks in a ctb/CU/MB
936
*
937
* @param[in] i4_frm_qstep
938
*  frm_qstep value based on which the threshold value is calculated
939
*
940
* @returns
941
*
942
* @remarks
943
*
944
*******************************************************************************
945
*/
946
WORD32 ihevce_had_8x8_using_4_4x4_r(
947
    UWORD8 *pu1_src,
948
    WORD32 src_strd,
949
    UWORD8 *pu1_pred,
950
    WORD32 pred_strd,
951
    WORD16 *pi2_dst,
952
    WORD32 dst_strd,
953
    WORD32 **ppi4_hsad,
954
    WORD32 **ppi4_tu_split,
955
    WORD32 **ppi4_tu_early_cbf,
956
    WORD32 pos_x_y_4x4,
957
    WORD32 num_4x4_in_row,
958
    WORD32 lambda,
959
    WORD32 lambda_q_shift,
960
    WORD32 i4_frm_qstep,
961
    WORD32 i4_cur_depth,
962
    WORD32 i4_max_depth,
963
    WORD32 i4_max_tr_size,
964
    WORD32 *pi4_tu_split_cost,
965
    void *pv_func_sel)
966
1.65M
{
967
1.65M
    WORD16 ai2_4x4_had[64];
968
1.65M
    WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
969
1.65M
    WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
970
1.65M
    WORD32 *pi4_4x4_hsad;
971
1.65M
    WORD32 *pi4_8x8_hsad;
972
1.65M
    WORD32 *pi4_8x8_tu_split;
973
974
1.65M
    WORD32 *pi4_8x8_tu_early_cbf;
975
976
1.65M
    UWORD32 u4_satd;
977
1.65M
    WORD32 cost_child = 0, cost_parent = 0;
978
1.65M
    WORD32 early_cbf = 0;
979
980
1.65M
    const UWORD8 u1_cur_tr_size = 8;
981
    /* Stores the best cost for the Current 8x8: Lokesh */
982
1.65M
    WORD32 best_cost = 0;
983
984
1.65M
    (void)pv_func_sel;
985
1.65M
    ASSERT(pos_x >= 0);
986
1.65M
    ASSERT(pos_y >= 0);
987
988
    /* Initialize pointers to  store 4x4 and 8x8 HAD SATDs */
989
1.65M
    pi4_4x4_hsad = ppi4_hsad[HAD_4x4] + pos_x + pos_y * num_4x4_in_row;
990
1.65M
    pi4_8x8_hsad = ppi4_hsad[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
991
1.65M
    pi4_8x8_tu_split = ppi4_tu_split[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
992
1.65M
    pi4_8x8_tu_early_cbf =
993
1.65M
        ppi4_tu_early_cbf[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
994
995
    /* -------- Compute four 4x4 HAD Transforms of 8x8 in one call--------- */
996
1.65M
    cost_child = ihevce_had4_4x4(
997
1.65M
        pu1_src, src_strd, pu1_pred, pred_strd, ai2_4x4_had, 8, pi4_4x4_hsad, num_4x4_in_row, 0);
998
999
    /* -------- Compute 8x8 HAD Transform using 4x4 results ------------- */
1000
1.65M
    u4_satd = ihevce_compute_8x8HAD_using_4x4(
1001
1.65M
        ai2_4x4_had, 8, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);
1002
1003
    /* store the normalized 8x8 satd */
1004
1.65M
    cost_parent = ((u4_satd + 4) >> 3);
1005
1006
    /* 4 CBF Flags, extra 1 becoz of the 0.5 bits per bin is assumed */
1007
1.65M
    cost_child += ((4) * lambda) >> (lambda_q_shift + 1);
1008
1009
1.65M
    if(i4_cur_depth < i4_max_depth)
1010
774k
    {
1011
774k
        if((cost_child < cost_parent) || (i4_max_tr_size < u1_cur_tr_size))
1012
175k
        {
1013
            //cost_child -= ((4) * lambda) >> (lambda_q_shift + 1);
1014
175k
            *pi4_tu_split_cost += (4 * lambda) >> (lambda_q_shift + 1);
1015
175k
            best_cost = cost_child;
1016
175k
            best_cost <<= 1;
1017
175k
            best_cost++;
1018
175k
            pi4_8x8_tu_split[0] = 1;
1019
175k
            pi4_8x8_hsad[0] = cost_child;
1020
175k
        }
1021
598k
        else
1022
598k
        {
1023
            //cost_parent -= ((1) * lambda) >>  (lambda_q_shift + 1);
1024
598k
            best_cost = cost_parent;
1025
598k
            best_cost <<= 1;
1026
598k
            pi4_8x8_tu_split[0] = 0;
1027
598k
            pi4_8x8_hsad[0] = cost_parent;
1028
598k
        }
1029
774k
    }
1030
882k
    else
1031
882k
    {
1032
        //cost_parent -= ((1) * lambda) >>  (lambda_q_shift + 1);
1033
882k
        best_cost = cost_parent;
1034
882k
        best_cost <<= 1;
1035
882k
        pi4_8x8_tu_split[0] = 0;
1036
882k
        pi4_8x8_hsad[0] = cost_parent;
1037
882k
    }
1038
1039
1.65M
    pi4_8x8_tu_early_cbf[0] = early_cbf;
1040
1041
    /* best cost has tu_split_flag at LSB(Least significant bit) */
1042
1.65M
    return ((best_cost << 1) + early_cbf);
1043
1.65M
}
1044
1045
/**
1046
*******************************************************************************
1047
*
1048
* @brief
1049
*   Computes 16x16 transform using children 8x8 hadamard results
1050
*    Modified to incorporate the dead-zone implementation - Lokesh
1051
*
1052
* @par Description:
1053
*
1054
* @param[in] pi2_8x8_had
1055
*  WORD16 pointer to 8x8 hadamard buffer(y0, y1, y2, y3 hadmard in Zscan order)
1056
*
1057
* @param[in] had8_strd
1058
*  stride of 8x8 hadmard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3
1059
*
1060
* @param[out] pi2_dst
1061
*  destination buffer where 16x16 hadamard result is stored
1062
*
1063
* @param[in] dst_stride
1064
*  stride of destination block
1065
*
1066
* @param[in] i4_frm_qstep
1067
*  frm_qstep value based on the which the threshold value is calculated
1068
*
1069
* @returns
1070
*  16x16 Hadamard SATD
1071
* @remarks
1072
*
1073
*******************************************************************************
1074
*/
1075
static UWORD32 ihevce_compute_16x16HAD_using_8x8(
1076
    WORD16 *pi2_8x8_had,
1077
    WORD32 had8_strd,
1078
    WORD16 *pi2_dst,
1079
    WORD32 dst_strd,
1080
    WORD32 i4_frm_qstep,
1081
    WORD32 *pi4_cbf)
1082
377k
{
1083
    /* Qstep value is right shifted by 8 */
1084
377k
    WORD32 threshold = (i4_frm_qstep >> 8);
1085
1086
    /* Initialize pointers to 4 subblocks of 8x8 HAD buffer */
1087
377k
    WORD16 *pi2_y0 = pi2_8x8_had;
1088
377k
    WORD16 *pi2_y1 = pi2_8x8_had + 8;
1089
377k
    WORD16 *pi2_y2 = pi2_8x8_had + had8_strd * 8;
1090
377k
    WORD16 *pi2_y3 = pi2_8x8_had + had8_strd * 8 + 8;
1091
1092
    /* Initialize pointers to store 8x8 HAD output */
1093
377k
    WORD16 *pi2_dst0 = pi2_dst;
1094
377k
    WORD16 *pi2_dst1 = pi2_dst + 8;
1095
377k
    WORD16 *pi2_dst2 = pi2_dst + dst_strd * 8;
1096
377k
    WORD16 *pi2_dst3 = pi2_dst + dst_strd * 8 + 8;
1097
1098
377k
    UWORD32 u4_satd = 0;
1099
377k
    WORD32 i;
1100
1101
    /*   Child HAD results combined as follows to get Parent result */
1102
    /*  _                                                 _         */
1103
    /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
1104
    /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
1105
    /* \-                                                 -/        */
1106
24.5M
    for(i = 0; i < 64; i++)
1107
24.1M
    {
1108
24.1M
        WORD32 src_idx = (i >> 3) * had8_strd + (i % 8);
1109
24.1M
        WORD32 dst_idx = (i >> 3) * dst_strd + (i % 8);
1110
1111
24.1M
        WORD16 a0 = pi2_y0[src_idx];
1112
24.1M
        WORD16 a1 = pi2_y1[src_idx];
1113
24.1M
        WORD16 a2 = pi2_y2[src_idx];
1114
24.1M
        WORD16 a3 = pi2_y3[src_idx];
1115
1116
24.1M
        WORD16 b0 = (a0 + a1) >> 1;
1117
24.1M
        WORD16 b1 = (a0 - a1) >> 1;
1118
24.1M
        WORD16 b2 = (a2 + a3) >> 1;
1119
24.1M
        WORD16 b3 = (a2 - a3) >> 1;
1120
1121
24.1M
        pi2_dst0[dst_idx] = b0 + b2;
1122
24.1M
        pi2_dst1[dst_idx] = b1 + b3;
1123
24.1M
        pi2_dst2[dst_idx] = b0 - b2;
1124
24.1M
        pi2_dst3[dst_idx] = b1 - b3;
1125
1126
        /* Make the value of dst to zerp, if it falls below the dead-zone */
1127
24.1M
        if(ABS(pi2_dst0[dst_idx]) > threshold)
1128
5.73M
            *pi4_cbf = 1;
1129
24.1M
        if(ABS(pi2_dst1[dst_idx]) > threshold)
1130
5.67M
            *pi4_cbf = 1;
1131
24.1M
        if(ABS(pi2_dst2[dst_idx]) > threshold)
1132
5.69M
            *pi4_cbf = 1;
1133
24.1M
        if(ABS(pi2_dst3[dst_idx]) > threshold)
1134
5.65M
            *pi4_cbf = 1;
1135
1136
24.1M
        u4_satd += ABS(pi2_dst0[dst_idx]);
1137
24.1M
        u4_satd += ABS(pi2_dst1[dst_idx]);
1138
24.1M
        u4_satd += ABS(pi2_dst2[dst_idx]);
1139
24.1M
        u4_satd += ABS(pi2_dst3[dst_idx]);
1140
24.1M
    }
1141
1142
    /* return 16x16 satd */
1143
377k
    return (u4_satd);
1144
377k
}
1145
1146
/**
1147
*******************************************************************************
1148
*
1149
* @brief
1150
*    Hadamard Transform for 16x16 block with 8x8 and 4x4 SATD updates.
1151
*    Uses recursive 8x8 had output to compute satd for 16x16 and its children
1152
*
1153
* @par Description:
1154
*
1155
* @param[in] pu1_origin
1156
*  UWORD8 pointer to the current block
1157
*
1158
* @param[in] src_strd
1159
*  WORD32 Source stride
1160
*
1161
* @param[in] pu1_pred
1162
*  UWORD8 pointer to the prediction block
1163
*
1164
* @param[in] pred_strd
1165
*  WORD32 Pred stride
1166
*
1167
* @param[out] pi2_dst
1168
*  WORD16 pointer to the transform output block
1169
*
1170
* @param[out] dst_strd
1171
*  WORD32 Destination stride
1172
*
1173
* @param[out] ppi4_hsad
1174
*   pointer to base pointers for storing hadmard sads of various
1175
*   block sizes (4x4 to 32x32)
1176
*
1177
* @param[in] pos_x_y_4x4
1178
*   Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
1179
*   Lower 16bits denote xpos and upper 16ypos of the 4x4block
1180
*
1181
* @param[in] num_4x4_in_row
1182
*   Denotes the number of current 4x4 blocks in a ctb/CU/MB
1183
*
1184
* @param[in] lambda
1185
*  lambda values is the cost factor calculated based on QP
1186
*
1187
* @param[in] lambda_q_shift
1188
*  lambda_q_shift used to reverse the lambda value back from q8 format
1189
*
1190
* @param[in] depth
1191
*  depth gives the current TU depth with respect to the CU
1192
*
1193
* @param[in] i4_frm_qstep
1194
*  frm_qstep value based on the which the threshold value is calculated
1195
*
1196
* @returns
1197
*
1198
* @remarks
1199
*
1200
*******************************************************************************
1201
*/
1202
1203
WORD32 ihevce_had_16x16_r(
1204
    UWORD8 *pu1_src,
1205
    WORD32 src_strd,
1206
    UWORD8 *pu1_pred,
1207
    WORD32 pred_strd,
1208
    WORD16 *pi2_dst,
1209
    WORD32 dst_strd,
1210
    WORD32 **ppi4_hsad,
1211
    WORD32 **ppi4_tu_split,
1212
    WORD32 **ppi4_tu_early_cbf,
1213
    WORD32 pos_x_y_4x4,
1214
    WORD32 num_4x4_in_row,
1215
    WORD32 lambda,
1216
    WORD32 lambda_q_shift,
1217
    WORD32 i4_frm_qstep,
1218
    WORD32 i4_cur_depth,
1219
    WORD32 i4_max_depth,
1220
    WORD32 i4_max_tr_size,
1221
    WORD32 *pi4_tu_split_cost,
1222
    void *pv_func_sel)
1223
377k
{
1224
377k
    WORD16 ai2_8x8_had[256];
1225
377k
    WORD32 *pi4_16x16_hsad;
1226
377k
    WORD32 *pi4_16x16_tu_split;
1227
1228
377k
    WORD32 *pi4_16x16_tu_early_cbf;
1229
1230
377k
    UWORD32 u4_satd = 0;
1231
377k
    WORD32 tu_split_flag = 0;
1232
377k
    WORD32 i4_early_cbf_flag = 0, early_cbf = 0;
1233
377k
    const UWORD8 u1_cur_tr_size = 16;
1234
1235
    /* cost_parent : Stores the cost of the parent HAD transform (16x16) */
1236
    /* cost_child : Stores the cost of the child HAD transform (16x16) */
1237
377k
    WORD32 cost_parent = 0, cost_child = 0;
1238
1239
    /*best_cost returns the best cost at the end of the function */
1240
    /*tu_split denoes whether the TU (16x16)is split or not */
1241
377k
    WORD32 best_cost = 0, best_cost_tu_split;
1242
377k
    WORD32 i;
1243
1244
377k
    WORD16 *pi2_y0;
1245
377k
    UWORD8 *pu1_src0;
1246
377k
    UWORD8 *pu1_pred0;
1247
377k
    WORD32 pos_x_y_4x4_0;
1248
1249
377k
    WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
1250
377k
    WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
1251
1252
377k
    ASSERT(pos_x >= 0);
1253
377k
    ASSERT(pos_y >= 0);
1254
1255
    /* Initialize pointers to  store 16x16 SATDs */
1256
377k
    pi4_16x16_hsad = ppi4_hsad[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
1257
1258
377k
    pi4_16x16_tu_split =
1259
377k
        ppi4_tu_split[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
1260
1261
377k
    pi4_16x16_tu_early_cbf =
1262
377k
        ppi4_tu_early_cbf[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
1263
1264
    /* -------- Compute four 8x8 HAD Transforms of 16x16 call--------- */
1265
1.88M
    for(i = 0; i < 4; i++)
1266
1.50M
    {
1267
1.50M
        pu1_src0 = pu1_src + (i & 0x01) * 8 + (i >> 1) * src_strd * 8;
1268
1.50M
        pu1_pred0 = pu1_pred + (i & 0x01) * 8 + (i >> 1) * pred_strd * 8;
1269
1.50M
        pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8;
1270
1.50M
        pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16);
1271
1272
1.50M
        best_cost_tu_split = ihevce_had_8x8_using_4_4x4_r(
1273
1.50M
            pu1_src0,
1274
1.50M
            src_strd,
1275
1.50M
            pu1_pred0,
1276
1.50M
            pred_strd,
1277
1.50M
            pi2_y0,
1278
1.50M
            16,
1279
1.50M
            ppi4_hsad,
1280
1.50M
            ppi4_tu_split,
1281
1.50M
            ppi4_tu_early_cbf,
1282
1.50M
            pos_x_y_4x4_0,
1283
1.50M
            num_4x4_in_row,
1284
1.50M
            lambda,
1285
1.50M
            lambda_q_shift,
1286
1.50M
            i4_frm_qstep,
1287
1.50M
            i4_cur_depth + 1,
1288
1.50M
            i4_max_depth,
1289
1.50M
            i4_max_tr_size,
1290
1.50M
            pi4_tu_split_cost,
1291
1.50M
            pv_func_sel);
1292
1293
        /* Cost is shifted by two bits for Tu_split_flag and early cbf flag */
1294
1.50M
        best_cost = (best_cost_tu_split >> 2);
1295
1296
        /* Last but one bit stores the information regarding the TU_Split */
1297
1.50M
        tu_split_flag += (best_cost_tu_split & 0x3) >> 1;
1298
1299
        /* Last bit stores the information regarding the early_cbf */
1300
1.50M
        i4_early_cbf_flag += (best_cost_tu_split & 0x1);
1301
1302
1.50M
        cost_child += best_cost;
1303
1304
1.50M
        tu_split_flag <<= 1;
1305
1.50M
        i4_early_cbf_flag <<= 1;
1306
1.50M
    }
1307
1308
    /* -------- Compute 16x16 HAD Transform using 8x8 results ------------- */
1309
377k
    pi2_y0 = ai2_8x8_had;
1310
1311
    /* Threshold currently passed as "0" */
1312
377k
    u4_satd =
1313
377k
        ihevce_compute_16x16HAD_using_8x8(pi2_y0, 16, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);
1314
1315
    /* store the normalized satd */
1316
377k
    cost_parent = ((u4_satd + 4) >> 3);
1317
1318
    /* 4 TU_Split flags , 4 CBF Flags, extra 1 becoz of the 0.5 bits per bin is assumed */
1319
377k
    cost_child += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1320
1321
377k
    i4_early_cbf_flag += early_cbf;
1322
1323
    /* Right now the depth is hard-coded to 4: The depth can be modified from the config file
1324
    which decides the extent to which TU_REC needs to be done */
1325
377k
    if(i4_cur_depth < i4_max_depth)
1326
285k
    {
1327
285k
        if((cost_child < cost_parent) || (i4_max_tr_size < u1_cur_tr_size))
1328
80.5k
        {
1329
            //cost_child -= ((4 + 4)  * lambda) >> (lambda_q_shift + 1);
1330
80.5k
            *pi4_tu_split_cost += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1331
80.5k
            tu_split_flag += 1;
1332
80.5k
            best_cost = cost_child;
1333
80.5k
        }
1334
204k
        else
1335
204k
        {
1336
            //cost_parent -= ((1 + 1) * lambda) >>  (lambda_q_shift + 1);
1337
204k
            tu_split_flag += 0;
1338
204k
            best_cost = cost_parent;
1339
204k
        }
1340
285k
    }
1341
92.0k
    else
1342
92.0k
    {
1343
        //cost_parent -= ((1 + 1) * lambda) >>  (lambda_q_shift + 1);
1344
92.0k
        tu_split_flag += 0;
1345
92.0k
        best_cost = cost_parent;
1346
92.0k
    }
1347
1348
377k
    pi4_16x16_hsad[0] = best_cost;
1349
377k
    pi4_16x16_tu_split[0] = tu_split_flag;
1350
377k
    pi4_16x16_tu_early_cbf[0] = i4_early_cbf_flag;
1351
1352
    /*returning two values(best cost & tu_split_flag) as a single value*/
1353
377k
    return ((best_cost << 10) + (tu_split_flag << 5) + i4_early_cbf_flag);
1354
377k
}
1355
1356
//#endif
1357
/**
1358
*******************************************************************************
1359
*
1360
* @brief
1361
*   Computes 32x32 transform using children 16x16 hadamard results
1362
*
1363
* @par Description:
1364
*
1365
* @param[in] pi2_16x16_had
1366
*  WORD16 pointer to 16x16 hadamard buffer(y0, y1, y2, y3 hadmard in Zscan order)
1367
*
1368
* @param[in] had16_strd
1369
*  stride of 16x16 hadmard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3
1370
*
1371
* @param[out] pi2_dst
1372
*  destination buffer where 16x16 hadamard result is stored
1373
*
1374
* @param[in] dst_stride
1375
*  stride of destination block
1376
*
1377
* @param[in] i4_frm_qstep
1378
*  frm_qstep value based on the which the threshold value is calculated
1379
*
1380
* @returns
1381
*  32x32 Hadamard SATD
1382
* @remarks
1383
*
1384
*******************************************************************************
1385
*/
1386
//#if COMPUTE_32x32_USING_16X16 == C
1387
UWORD32 ihevce_compute_32x32HAD_using_16x16(
1388
    WORD16 *pi2_16x16_had,
1389
    WORD32 had16_strd,
1390
    WORD16 *pi2_dst,
1391
    WORD32 dst_strd,
1392
    WORD32 i4_frm_qstep,
1393
    WORD32 *pi4_cbf)
1394
52.6k
{
1395
    /* Qstep value is right shifted by 8 */
1396
52.6k
    WORD32 threshold = (i4_frm_qstep >> 8);
1397
1398
    /* Initialize pointers to 4 subblocks of 8x8 HAD buffer */
1399
52.6k
    WORD16 *pi2_y0 = pi2_16x16_had;
1400
52.6k
    WORD16 *pi2_y1 = pi2_16x16_had + 16;
1401
52.6k
    WORD16 *pi2_y2 = pi2_16x16_had + had16_strd * 16;
1402
52.6k
    WORD16 *pi2_y3 = pi2_16x16_had + had16_strd * 16 + 16;
1403
1404
    /* Initialize pointers to store 8x8 HAD output */
1405
52.6k
    WORD16 *pi2_dst0 = pi2_dst;
1406
52.6k
    WORD16 *pi2_dst1 = pi2_dst + 16;
1407
52.6k
    WORD16 *pi2_dst2 = pi2_dst + dst_strd * 16;
1408
52.6k
    WORD16 *pi2_dst3 = pi2_dst + dst_strd * 16 + 16;
1409
1410
52.6k
    UWORD32 u4_satd = 0;
1411
52.6k
    WORD32 i;
1412
1413
    /*   Child HAD results combined as follows to get Parent result */
1414
    /*  _                                                 _         */
1415
    /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
1416
    /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
1417
    /* \-                                                 -/        */
1418
13.5M
    for(i = 0; i < 256; i++)
1419
13.4M
    {
1420
13.4M
        WORD32 src_idx = (i >> 4) * had16_strd + (i % 16);
1421
13.4M
        WORD32 dst_idx = (i >> 4) * dst_strd + (i % 16);
1422
1423
13.4M
        WORD16 a0 = pi2_y0[src_idx] >> 2;
1424
13.4M
        WORD16 a1 = pi2_y1[src_idx] >> 2;
1425
13.4M
        WORD16 a2 = pi2_y2[src_idx] >> 2;
1426
13.4M
        WORD16 a3 = pi2_y3[src_idx] >> 2;
1427
1428
13.4M
        WORD16 b0 = (a0 + a1);
1429
13.4M
        WORD16 b1 = (a0 - a1);
1430
13.4M
        WORD16 b2 = (a2 + a3);
1431
13.4M
        WORD16 b3 = (a2 - a3);
1432
1433
13.4M
        pi2_dst0[dst_idx] = b0 + b2;
1434
13.4M
        pi2_dst1[dst_idx] = b1 + b3;
1435
13.4M
        pi2_dst2[dst_idx] = b0 - b2;
1436
13.4M
        pi2_dst3[dst_idx] = b1 - b3;
1437
1438
        /* Make the value of dst to zerp, if it falls below the dead-zone */
1439
13.4M
        if(ABS(pi2_dst0[dst_idx]) > threshold)
1440
2.60M
            *pi4_cbf = 1;
1441
13.4M
        if(ABS(pi2_dst1[dst_idx]) > threshold)
1442
2.57M
            *pi4_cbf = 1;
1443
13.4M
        if(ABS(pi2_dst2[dst_idx]) > threshold)
1444
2.59M
            *pi4_cbf = 1;
1445
13.4M
        if(ABS(pi2_dst3[dst_idx]) > threshold)
1446
2.56M
            *pi4_cbf = 1;
1447
1448
13.4M
        u4_satd += ABS(pi2_dst0[dst_idx]);
1449
13.4M
        u4_satd += ABS(pi2_dst1[dst_idx]);
1450
13.4M
        u4_satd += ABS(pi2_dst2[dst_idx]);
1451
13.4M
        u4_satd += ABS(pi2_dst3[dst_idx]);
1452
13.4M
    }
1453
1454
    /* return 32x32 satd */
1455
52.6k
    return (u4_satd);
1456
52.6k
}
1457
//#endif
1458
1459
/**
1460
*******************************************************************************
1461
*
1462
* @brief
1463
*    Hadamard Transform for 32x32 block with 16x6, 8x8 and 4x4 SATD updates.
1464
*    Uses recursive 16x16 had output to compute satd for 32x32 and its children
1465
*
1466
* @par Description:
1467
*
1468
* @param[in] pu1_origin
1469
*  UWORD8 pointer to the current block
1470
*
1471
* @param[in] src_strd
1472
*  WORD32 Source stride
1473
*
1474
* @param[in] pu1_pred
1475
*  UWORD8 pointer to the prediction block
1476
*
1477
* @param[in] pred_strd
1478
*  WORD32 Pred stride
1479
*
1480
* @param[out] pi2_dst
1481
*  WORD16 pointer to the transform output block
1482
*
1483
* @param[out] dst_strd
1484
*  WORD32 Destination stride
1485
*
1486
* @param[out] ppi4_hsad
1487
*   pointer to base pointers for storing hadmard sads of various
1488
*   block sizes (4x4 to 32x32)
1489
*
1490
* @param[in] pos_x_y_4x4
1491
*   Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
1492
*   Lower 16bits denote xpos and upper 16ypos of the 4x4block
1493
*
1494
* @param[in] num_4x4_in_row
1495
*   Denotes the number of current 4x4 blocks in a ctb/CU/MB
1496
*
1497
* @param[in] lambda
1498
*  lambda values is the cost factor calculated based on QP
1499
*
1500
* @param[in] lambda_q_shift
1501
*  lambda_q_shift used to reverse the lambda value back from q8 format
1502
*
1503
* @param[in] depth
1504
*  depth gives the current TU depth with respect to the CU
1505
*
1506
* @param[in] i4_frm_qstep
1507
*  frm_qstep value based on the which the threshold value is calculated
1508
*
1509
*
1510
* @returns
1511
*
1512
* @remarks
1513
*
1514
*******************************************************************************
1515
*/
1516
void ihevce_had_32x32_r(
1517
    UWORD8 *pu1_src,
1518
    WORD32 src_strd,
1519
    UWORD8 *pu1_pred,
1520
    WORD32 pred_strd,
1521
    WORD16 *pi2_dst,
1522
    WORD32 dst_strd,
1523
    WORD32 **ppi4_hsad,
1524
    WORD32 **ppi4_tu_split,
1525
    WORD32 **ppi4_tu_early_cbf,
1526
    WORD32 pos_x_y_4x4,
1527
    WORD32 num_4x4_in_row,
1528
    WORD32 lambda,
1529
    WORD32 lambda_q_shift,
1530
    WORD32 i4_frm_qstep,
1531
    WORD32 i4_cur_depth,
1532
    WORD32 i4_max_depth,
1533
    WORD32 i4_max_tr_size,
1534
    WORD32 *pi4_tu_split_cost,
1535
    me_func_selector_t *ps_func_selector)
1536
1537
52.6k
{
1538
52.6k
    WORD16 ai2_16x16_had[1024];
1539
52.6k
    WORD32 *pi4_32x32_hsad;
1540
52.6k
    WORD32 *pi4_32x32_tu_split;
1541
52.6k
    WORD32 *pi4_32x32_tu_early_cbf;
1542
1543
52.6k
    WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
1544
52.6k
    WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
1545
52.6k
    WORD32 tu_split_flag = 0;
1546
52.6k
    const UWORD8 u1_cur_tr_size = 32;
1547
52.6k
    WORD32 i4_early_cbf_flag = 0, early_cbf = 0;
1548
1549
    /* cost_parent : Stores the cost of the parent HAD transform (16x16) */
1550
    /* cost_child : Stores the cost of the child HAD transform (16x16) */
1551
52.6k
    WORD32 cost_child = 0, cost_parent = 0;
1552
1553
    /*retuned as the best cost for the entire TU (32x32) */
1554
52.6k
    WORD32 best_cost = 0;
1555
    /*captures the best cost and tu_split at child level */
1556
52.6k
    WORD32 best_cost_tu_split;
1557
1558
    /* Initialize pointers to 4 8x8 blocks in 16x16 */
1559
52.6k
    WORD16 *pi2_y0 = ai2_16x16_had;
1560
52.6k
    WORD16 *pi2_y1 = ai2_16x16_had + 16;
1561
52.6k
    WORD16 *pi2_y2 = ai2_16x16_had + 32 * 16;
1562
52.6k
    WORD16 *pi2_y3 = ai2_16x16_had + 32 * 16 + 16;
1563
1564
52.6k
    UWORD8 *pu1_src0 = pu1_src;
1565
52.6k
    UWORD8 *pu1_src1 = pu1_src + 16;
1566
52.6k
    UWORD8 *pu1_src2 = pu1_src + src_strd * 16;
1567
52.6k
    UWORD8 *pu1_src3 = pu1_src + src_strd * 16 + 16;
1568
1569
52.6k
    UWORD8 *pu1_pred0 = pu1_pred;
1570
52.6k
    UWORD8 *pu1_pred1 = pu1_pred + 16;
1571
52.6k
    UWORD8 *pu1_pred2 = pu1_pred + pred_strd * 16;
1572
52.6k
    UWORD8 *pu1_pred3 = pu1_pred + pred_strd * 16 + 16;
1573
1574
52.6k
    ASSERT(pos_x >= 0);
1575
52.6k
    ASSERT(pos_y >= 0);
1576
1577
    /* Initialize pointers to store 32x32 SATDs */
1578
52.6k
    pi4_32x32_hsad = ppi4_hsad[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);
1579
1580
52.6k
    pi4_32x32_tu_split =
1581
52.6k
        ppi4_tu_split[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);
1582
1583
52.6k
    pi4_32x32_tu_early_cbf =
1584
52.6k
        ppi4_tu_early_cbf[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);
1585
1586
    /* -------- Compute four 8x8 HAD Transforms of 16x16 call--------- */
1587
52.6k
    best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1588
52.6k
        pu1_src0,
1589
52.6k
        src_strd,
1590
52.6k
        pu1_pred0,
1591
52.6k
        pred_strd,
1592
52.6k
        pi2_y0,
1593
52.6k
        32,
1594
52.6k
        ppi4_hsad,
1595
52.6k
        ppi4_tu_split,
1596
52.6k
        ppi4_tu_early_cbf,
1597
52.6k
        pos_x_y_4x4,
1598
52.6k
        num_4x4_in_row,
1599
52.6k
        lambda,
1600
52.6k
        lambda_q_shift,
1601
52.6k
        i4_frm_qstep,
1602
52.6k
        i4_cur_depth + 1,
1603
52.6k
        i4_max_depth,
1604
52.6k
        i4_max_tr_size,
1605
52.6k
        pi4_tu_split_cost,
1606
52.6k
        NULL);
1607
1608
    /* cost is shifted by 10bits */
1609
52.6k
    best_cost = best_cost_tu_split >> 10;
1610
1611
    /* Tu split is present in the 6-10 bits */
1612
52.6k
    tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1613
1614
    /*Early CBF info is present in the last 5 bits */
1615
52.6k
    i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1616
1617
52.6k
    tu_split_flag <<= 5;
1618
52.6k
    i4_early_cbf_flag <<= 5;
1619
1620
52.6k
    cost_child += best_cost;
1621
1622
52.6k
    best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1623
52.6k
        pu1_src1,
1624
52.6k
        src_strd,
1625
52.6k
        pu1_pred1,
1626
52.6k
        pred_strd,
1627
52.6k
        pi2_y1,
1628
52.6k
        32,
1629
52.6k
        ppi4_hsad,
1630
52.6k
        ppi4_tu_split,
1631
52.6k
        ppi4_tu_early_cbf,
1632
52.6k
        pos_x_y_4x4 + 4,
1633
52.6k
        num_4x4_in_row,
1634
52.6k
        lambda,
1635
52.6k
        lambda_q_shift,
1636
52.6k
        i4_frm_qstep,
1637
52.6k
        i4_cur_depth + 1,
1638
52.6k
        i4_max_depth,
1639
52.6k
        i4_max_tr_size,
1640
52.6k
        pi4_tu_split_cost,
1641
52.6k
        NULL);
1642
1643
    /* cost is shifted by 10bits */
1644
52.6k
    best_cost = best_cost_tu_split >> 10;
1645
1646
    /* Tu split is present in the 6-10 bits */
1647
52.6k
    tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1648
1649
    /*Early CBF info is present in the last 5 bits */
1650
52.6k
    i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1651
1652
52.6k
    tu_split_flag <<= 5;
1653
52.6k
    i4_early_cbf_flag <<= 5;
1654
1655
52.6k
    cost_child += best_cost;
1656
1657
52.6k
    best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1658
52.6k
        pu1_src2,
1659
52.6k
        src_strd,
1660
52.6k
        pu1_pred2,
1661
52.6k
        pred_strd,
1662
52.6k
        pi2_y2,
1663
52.6k
        32,
1664
52.6k
        ppi4_hsad,
1665
52.6k
        ppi4_tu_split,
1666
52.6k
        ppi4_tu_early_cbf,
1667
52.6k
        pos_x_y_4x4 + (4 << 16),
1668
52.6k
        num_4x4_in_row,
1669
52.6k
        lambda,
1670
52.6k
        lambda_q_shift,
1671
52.6k
        i4_frm_qstep,
1672
52.6k
        i4_cur_depth + 1,
1673
52.6k
        i4_max_depth,
1674
52.6k
        i4_max_tr_size,
1675
52.6k
        pi4_tu_split_cost,
1676
52.6k
        NULL);
1677
1678
    /* cost is shifted by 10bits */
1679
52.6k
    best_cost = best_cost_tu_split >> 10;
1680
1681
    /* Tu split is present in the 6-10 bits */
1682
52.6k
    tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1683
1684
    /*Early CBF info is present in the last 5 bits */
1685
52.6k
    i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1686
1687
52.6k
    tu_split_flag <<= 5;
1688
52.6k
    i4_early_cbf_flag <<= 5;
1689
1690
52.6k
    cost_child += best_cost;
1691
1692
52.6k
    best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1693
52.6k
        pu1_src3,
1694
52.6k
        src_strd,
1695
52.6k
        pu1_pred3,
1696
52.6k
        pred_strd,
1697
52.6k
        pi2_y3,
1698
52.6k
        32,
1699
52.6k
        ppi4_hsad,
1700
52.6k
        ppi4_tu_split,
1701
52.6k
        ppi4_tu_early_cbf,
1702
52.6k
        pos_x_y_4x4 + (4 << 16) + 4,
1703
52.6k
        num_4x4_in_row,
1704
52.6k
        lambda,
1705
52.6k
        lambda_q_shift,
1706
52.6k
        i4_frm_qstep,
1707
52.6k
        i4_cur_depth + 1,
1708
52.6k
        i4_max_depth,
1709
52.6k
        i4_max_tr_size,
1710
52.6k
        pi4_tu_split_cost,
1711
52.6k
        NULL);
1712
1713
    /* cost is shifted by 10bits */
1714
52.6k
    best_cost = best_cost_tu_split >> 10;
1715
1716
    /* Tu split is present in the 6-10 bits */
1717
52.6k
    tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1718
1719
    /*Early CBF info is present in the last 5 bits */
1720
52.6k
    i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1721
1722
52.6k
    tu_split_flag <<= 1;
1723
52.6k
    i4_early_cbf_flag <<= 1;
1724
1725
52.6k
    cost_child += best_cost;
1726
1727
52.6k
    {
1728
52.6k
        UWORD32 u4_satd = 0;
1729
1730
52.6k
        u4_satd = ps_func_selector->pf_compute_32x32HAD_using_16x16(
1731
52.6k
            pi2_y0, 32, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);
1732
1733
52.6k
        cost_parent = ((u4_satd + 2) >> 2);
1734
52.6k
    }
1735
1736
    /* 4 TU_Split flags , 4 CBF Flags*/
1737
52.6k
    cost_child += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1738
1739
52.6k
    i4_early_cbf_flag += early_cbf;
1740
1741
    /* 1 TU_SPlit flag, 1 CBF flag */
1742
    //cost_parent += ((1 + 1)* lambda) >>  (lambda_q_shift + 1);
1743
1744
52.6k
    if(i4_cur_depth < i4_max_depth)
1745
49.7k
    {
1746
49.7k
        if((cost_child < cost_parent) || (u1_cur_tr_size > i4_max_tr_size))
1747
18.2k
        {
1748
18.2k
            *pi4_tu_split_cost += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1749
18.2k
            best_cost = cost_child;
1750
18.2k
            tu_split_flag++;
1751
18.2k
        }
1752
31.4k
        else
1753
31.4k
        {
1754
31.4k
            tu_split_flag = 0;
1755
31.4k
            best_cost = cost_parent;
1756
31.4k
        }
1757
49.7k
    }
1758
2.93k
    else
1759
2.93k
    {
1760
2.93k
        tu_split_flag = 0;
1761
2.93k
        best_cost = cost_parent;
1762
2.93k
    }
1763
1764
52.6k
    pi4_32x32_tu_split[0] = tu_split_flag;
1765
1766
52.6k
    pi4_32x32_hsad[0] = best_cost;
1767
1768
52.6k
    pi4_32x32_tu_early_cbf[0] = i4_early_cbf_flag;
1769
52.6k
}